Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5059,6 +5059,20 @@
   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 }
 
+// Return true if the instruction zeroes the unused upper part of the
+// destination register and accepts a mask.
+static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86ISD::PCMPEQM:
+  case X86ISD::PCMPGTM:
+  case X86ISD::CMPM:
+  case X86ISD::CMPMU:
+    return true;
+  }
+}
+
 /// Insert i1-subvector to i1-vector.
 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
@@ -5091,6 +5105,22 @@
   // 3. Subvector should be inserted in the middle (for example v2i1
   //    to v16i1, index 2)
 
+  // If this node widens - by concatenating zeroes - the type of the result
+  // of a node whose instruction zeroes all upper (irrelevant) bits of the
+  // output register, mark this node as legal so that instruction selection
+  // can replace it with the narrow version of the previous instruction.
+  // For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a
+  // k-register while zeroing the remaining upper 60 bits of the register. If
+  // the result of such an instruction is inserted into an all-zeroes vector,
+  // then we can safely remove the insert_subvector (during instruction
+  // selection) as the compare instruction has already zeroed the rest of the
+  // register.
+  if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
+      (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
+       (SubVec.getOpcode() == ISD::AND &&
+        (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
+         isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
+    return Op;
+
   // extend to natively supported kshift
   MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
   MVT WideOpVT = OpVT;
@@ -7913,6 +7943,60 @@
   return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
 }
 
+// Return true if all the operands of the given CONCAT_VECTORS node are zeros
+// except for the first one, i.e. (CONCAT_VECTORS Op, 0, 0, ..., 0).
+static bool isExpandWithZeros(const SDValue &Op) {
+  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
+         "Expand with zeros only possible in CONCAT_VECTORS nodes!");
+
+  for (unsigned i = 1; i < Op.getNumOperands(); i++)
+    if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
+      return false;
+
+  return true;
+}
+
+// If the given node is a type promotion (by concatenating i1 zeros) of the
+// result of a node that already zeroes all upper bits of the k-register,
+// return that node; otherwise return an empty SDValue.
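+// For example (a hypothetical DAG shape; any of the compares listed in
+// isMaskedZeroUpperBitsvXi1 would do):
+//   (v32i1 (concat_vectors
+//            (v8i1 (insert_subvector (v8i1 immAllZerosV),
+//                                    (v2i1 (X86ISD::PCMPEQM a, b)), 0)),
+//            zeroes, zeroes, zeroes))
+// peels back to the PCMPEQM node, whose instruction has already cleared the
+// upper bits of the k-register.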
+static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
+  unsigned Opc = Op.getOpcode();
+
+  assert(Opc == ISD::CONCAT_VECTORS &&
+         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+         "Unexpected node to check for type promotion!");
+
+  // As long as we are concatenating zeros to the upper part of a previous
+  // node's result, climb up the tree until a node with a different opcode is
+  // encountered.
+  while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
+    if (Opc == ISD::INSERT_SUBVECTOR) {
+      if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
+          Op.getConstantOperandVal(2) == 0)
+        Op = Op.getOperand(1);
+      else
+        return SDValue();
+    } else { // Opc == ISD::CONCAT_VECTORS
+      if (isExpandWithZeros(Op))
+        Op = Op.getOperand(0);
+      else
+        return SDValue();
+    }
+    Opc = Op.getOpcode();
+  }
+
+  // Check if the first inserted node zeroes the upper bits, or an 'and'
+  // result of a node that zeroes the upper bits (i.e. its masked version).
+  if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
+      (Op.getOpcode() == ISD::AND &&
+       (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
+        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode()))))
+    return Op;
+
+  return SDValue();
+}
+
 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG & DAG) {
@@ -7923,6 +8007,17 @@
   assert(isPowerOf2_32(NumOfOperands) &&
          "Unexpected number of operands in CONCAT_VECTORS");
 
+  // If this node promotes - by concatenating zeroes - the type of the result
+  // of a node whose instruction zeroes all upper (irrelevant) bits of the
+  // output register, mark it as legal and catch the pattern in instruction
+  // selection to avoid emitting extra instructions (for zeroing the upper
+  // bits).
+  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
+    SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
+    SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
+                       ZeroC);
+  }
+
   SDValue Undef = DAG.getUNDEF(ResVT);
   if (NumOfOperands > 2) {
     // Specialize the cases when all, or all but one, of the operands are undef.
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -185,6 +185,20 @@
 def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
                                              v2f64x_info>;
 
+class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm,
+                       ValueType _vt> {
+  RegisterClass KRC = _krc;
+  RegisterClass KRCWM = _krcwm;
+  ValueType KVT = _vt;
+}
+
+def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>;
+def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>;
+def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>;
+def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>;
+def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>;
+def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>;
+
 // This multiclass generates the masking variants from the non-masking
 // variant. It only provides the assembly pieces for the masking variants.
// It assumes custom ISel patterns for masking which can be provided as @@ -1735,17 +1749,217 @@ avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPGTDZrr - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPEQDZrr - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -} +multiclass avx512_icmp_packed_lowering Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rr) _.RC:$src1, _.RC:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rm) _.RC:$src1, addr:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rrk) _.KRCWM:$mask, + _.RC:$src1, _.RC:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmk) _.KRCWM:$mask, + _.RC:$src1, addr:$src2), + NewInf.KRC)>; +} +} + +multiclass avx512_icmp_packed_rmb_lowering Preds> + : avx512_icmp_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmb) _.RC:$src1, addr:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmbk) _.KRCWM:$mask, + _.RC:$src1, addr:$src2), + NewInf.KRC)>; +} +} + +// VPCMPEQB - i8 +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; + +// VPCMPEQW - i16 +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; + +// VPCMPEQD - i32 +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +// VPCMPEQQ - i64 +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; 
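+
+// Example of what the *_lowering patterns above catch (a hypothetical
+// instantiation with v16i8 operands, assuming BWI and VLX): the DAG
+//   (v32i1 (insert_subvector immAllZerosV,
+//                            (v16i1 (X86pcmpeqm a, b)), (i64 0)))
+// now selects to a single VPCMPEQBZ128rr plus a COPY_TO_REGCLASS into the
+// wider mask register class, instead of the compare followed by a
+// KSHIFTL/KSHIFTR pair to clear the upper bits of the k-register (see the
+// removed kshift pairs in avx512vl-intrinsics-upgrade.ll below).
+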
+defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +// VPCMPGTB - i8 +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; + +// VPCMPGTW - i16 +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; +defm : avx512_icmp_packed_lowering; + +defm : avx512_icmp_packed_lowering; + +// VPCMPGTD - i32 +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +// VPCMPGTQ - i64 +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; + +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; +defm : avx512_icmp_packed_rmb_lowering; multiclass avx512_icmp_cc opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> { @@ -1908,6 +2122,237 @@ defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_cc_packed_lowering Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rrik) _.KRCWM:$mask, + _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))), + imm:$cc)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmik) _.KRCWM:$mask, + _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +multiclass avx512_icmp_cc_packed_rmb_lowering Preds> + : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmib) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + imm:$cc)))), + (i64 0)), + (COPY_TO_REGCLASS 
(!cast(InstrStr##rmibk) _.KRCWM:$mask, + _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +// VPCMPB - i8 +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; + +// VPCMPW - i16 +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; + +// VPCMPD - i32 +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +// VPCMPQ - i64 +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +// VPCMPUB - i8 +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; + +// VPCMPUW - i16 +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; +defm : avx512_icmp_cc_packed_lowering; + +defm : avx512_icmp_cc_packed_lowering; + +// VPCMPUD - i32 +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +// VPCMPUQ - i64 +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; +defm : avx512_icmp_cc_packed_rmb_lowering; + multiclass avx512_vcmp_common { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, @@ -1998,21 +2443,108 @@ defm VCMPPS : avx512_vcmp, AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; -def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VCMPPSZrri - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; -def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VPCMPDZrri - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - 
imm:$cc), VK8)>; -def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VPCMPUDZrri - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; +multiclass avx512_fcmp_cc_packed_lowering Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rmbi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +multiclass avx512_fcmp_cc_packed_sae_lowering Preds> + : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { + +let Predicates = Preds in + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))), + (i64 0)), + (COPY_TO_REGCLASS (!cast(InstrStr##rrib) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; +} + + +// VCMPPS - f32 +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; + +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; + +defm : avx512_fcmp_cc_packed_sae_lowering; +defm : avx512_fcmp_cc_packed_sae_lowering; + +// VCMPPD - f64 +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; + +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; +defm : avx512_fcmp_cc_packed_lowering; + +defm : avx512_fcmp_cc_packed_sae_lowering; +defm : avx512_fcmp_cc_packed_sae_lowering; +defm : avx512_fcmp_cc_packed_sae_lowering; // ---------------------------------------------------------------- // FPClass @@ -2498,6 +3030,69 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>; defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>; +multiclass axv512_icmp_packed_no_vlx_lowering { +def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (!cast(InstStr##Zrr) + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrr) + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (i8 8)), (i8 8))>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, + (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrk) + (COPY_TO_REGCLASS 
VK8:$mask, VK16), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (i8 8)), (i8 8))>; +} + +multiclass axv512_icmp_packed_cc_no_vlx_lowering { +def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (!cast(InstStr##Zrri) + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrri) + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), + (i8 8)), (i8 8))>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, + (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrik) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), + (i8 8)), (i8 8))>; +} + +let Predicates = [HasAVX512, NoVLX] in { + defm : axv512_icmp_packed_no_vlx_lowering; + defm : axv512_icmp_packed_no_vlx_lowering; + + defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; + defm : axv512_icmp_packed_cc_no_vlx_lowering; +} + // Mask setting all 0s or 1s multiclass avx512_mask_setop { let Predicates = [HasAVX512] in Index: test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -1004,8 +1004,6 @@ ; CHECK-LABEL: test_pcmpeq_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1018,8 +1016,6 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1058,8 +1054,6 @@ ; CHECK-LABEL: test_pcmpgt_q_256: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1072,8 +1066,6 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## encoding: 
[0x62,0xf2,0xfd,0x29,0x37,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1087,8 +1079,6 @@ ; CHECK-LABEL: test_pcmpeq_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1101,8 +1091,6 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1116,10 +1104,6 @@ ; CHECK-LABEL: test_pcmpeq_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1] -; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] -; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1132,10 +1116,6 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1] -; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] -; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1149,8 +1129,6 @@ ; CHECK-LABEL: test_pcmpgt_d_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1163,8 +1141,6 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x66,0xc1] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: 
retq ## encoding: [0xc3] @@ -1178,10 +1154,6 @@ ; CHECK-LABEL: test_pcmpgt_q_128: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1] -; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] -; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -1194,10 +1166,6 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x37,0xc1] -; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e] -; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e] -; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] -; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] Index: test/CodeGen/X86/avx512vl-vec-masked-cmp.ll =================================================================== --- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -0,0 +1,55138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=NoVLX + +define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi0: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi2: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi3: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi4: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi5: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi6: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi7: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi8: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi9: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi10: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi11: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: 
Lcfi12: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi13: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi14: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi15: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x 
i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi16: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi17: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi18: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi19: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi20: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi21: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi22: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi23: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi24: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi25: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi26: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi27: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi28: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi29: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi30: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi31: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi32: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi33: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi34: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi35: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi36: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi37: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi38: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi39: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp eq <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 +; CHECK-NEXT: 
kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi40:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi41:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi42:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi43:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi44:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi45:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi46:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi47:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <16 x i8>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <16 x i8>
+  %2 = icmp eq <16 x i8> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi48:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi49:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi50:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi51:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi52:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi53:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi54:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi55:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <16 x i8>
+  %1 = bitcast <2 x i64> %__b to <16 x i8>
+  %2 = icmp eq <16 x i8> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi56:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi57:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi58:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi59:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi60:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi61:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi62:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi63:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <16 x i8>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <16 x i8>
+  %2 = icmp eq <16 x i8> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi64:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi65:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi66:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <32 x i8>
+  %1 = bitcast <4 x i64> %__b to <32 x i8>
+  %2 = icmp eq <32 x i8> %0, %1
+  %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqb (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi67:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi68:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi69:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <32 x i8>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <32 x i8>
+  %2 = icmp eq <32 x i8> %0, %1
+  %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi70:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi71:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi72:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <32 x i8>
+  %1 = bitcast <4 x i64> %__b to <32 x i8>
+  %2 = icmp eq <32 x i8> %0, %1
+  %3 = bitcast i32 %__u to <32 x i1>
+  %4 = and <32 x i1> %2, %3
+  %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi73:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi74:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi75:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <32 x i8>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <32 x i8>
+  %2 = icmp eq <32 x i8> %0, %1
+  %3 = bitcast i32 %__u to <32 x i1>
+  %4 = and <32 x i1> %2, %3
+  %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %1 = bitcast <2 x i64> %__b to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %1 = bitcast <2 x i64> %__b to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi76:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi77:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi78:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %1 = bitcast <2 x i64> %__b to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi79:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi80:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi81:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi82:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi83:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi84:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %1 = bitcast <2 x i64> %__b to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi85:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi86:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi87:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi88:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi89:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi90:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %1 = bitcast <2 x i64> %__b to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi91:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi92:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi93:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi94:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi95:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi96:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %1 = bitcast <2 x i64> %__b to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi97:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi98:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi99:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <8 x i16>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <8 x i16>
+  %2 = icmp eq <8 x i16> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi100:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi101:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi102:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi103:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi104:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi105:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi106:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi107:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <16 x i16>
+  %1 = bitcast <4 x i64> %__b to <16 x i16>
+  %2 = icmp eq <16 x i16> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi108:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi109:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi110:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi111:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi112:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi113:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi114:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi115:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi126:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi127:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi128:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi129:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi130:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi131:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <16 x i16>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <16 x i16>
+  %2 = icmp eq <16 x i16> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi132:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi133:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi134:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi135:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi136:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi137:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi138:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi139:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <16 x i16>
+  %1 = bitcast <4 x i64> %__b to <16 x i16>
+  %2 = icmp eq <16 x i16> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi140:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi141:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi142:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi143:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi144:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi145:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi146:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi147:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi148: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi149: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi150: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; 
NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi151: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi152: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi153: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi154: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi155: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; 
NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi156: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi157: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi158: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi159: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi160: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi161: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi162: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi163: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp eq <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi164: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi165: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi166: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; 
NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; 
NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, 
%xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, 
%eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp eq <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi167: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi168: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi169: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: 
shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, 
%xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp eq <32 x i16> %0, 
%1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi170: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi171: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi172: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw 
$1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm4 +; NoVLX-NEXT: 
vpmovdb %zmm6, %xmm6 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp eq <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: +; 
NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi173: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi174: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi175: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; 
NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp eq <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; 
NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load 
= load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 
{%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw 
%k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw 
%eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; 
CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd 
$255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = 
[0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi176: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi177: +; NoVLX-NEXT: .cfi_offset %rbp, 
-16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi178: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi179: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi180: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi181: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi182: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi183: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi184: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd 
%xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi185: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi186: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi187: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd 
%zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi188: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi189: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi190: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi191: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi192: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi193: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw 
$15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi194: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi195: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi196: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + 
+define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi197: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi198: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi199: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi200: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi201: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi202: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi203: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi204: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi205: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp 
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp eq <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi206:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi207:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi208:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp eq <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi209:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi210:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi211:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp eq <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi212:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi213:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi214:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi215:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi216:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi217:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi218:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi219:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi220:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi221:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi222:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi223:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi224:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi225:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi226:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi233:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi234:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi235:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi236:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi237:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi238:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi239:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi240:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi241:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi242:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi243:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi244:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi245:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi246:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi247:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+  %2 = icmp eq <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi248:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi249:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi250:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi251:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi252:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi253:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi254:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi255:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %1 = bitcast <8 x i64> %__b to <16 x i32>
+  %2 = icmp eq <16 x i32> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi256:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi257:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi258:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi259:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi260:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi261:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi262:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi263:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <16 x i32>
+  %2 = icmp eq <16 x i32> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi264:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi265:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi266:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi267:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi268:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi269:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi270:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi271:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %1 = bitcast <8 x i64> %__b to <16 x i32>
+  %2 = icmp eq <16 x i32> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi272:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi273:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi274:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi275:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi276:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi277:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi278:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi279:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <16 x i32>
+  %2 = icmp eq <16 x i32> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi280:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi281:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi282:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi283:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi284:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi285:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi286:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi287:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15,
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi288: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi289: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi290: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi291: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi292: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi293: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi294: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi295: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; 
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi296: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi297: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi298: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi299: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi300: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi301: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi302: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi303: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, 
%zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi304: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi305: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi306: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi307: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi308: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi309: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi310: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi311: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi312: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi313: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi314: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi315: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi316: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi317: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi318: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi319: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi320: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi321: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: 
Lcfi322: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi323: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi324: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi325: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi326: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi327: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: 
leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi328: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi329: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi330: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi331: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi332: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi333: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi334: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi335: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi336: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi337: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi338: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi339: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi340: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi341: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi342: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi343: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + +define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; 
NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = 
insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax 
+; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 
= shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + 
%3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq 
(%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: +; CHECK: ## 
BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi344: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi345: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi346: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi347: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi348: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi349: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: +; NoVLX: ## BB#0: 
## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi350: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi351: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi352: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi353: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi354: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi355: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq 
%rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi356: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi357: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi358: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi359: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi360: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi361: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi362: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi363: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi364: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi365: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi366: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: 
movq %rsp, %rbp +; NoVLX-NEXT: Lcfi367: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi368: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi369: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi370: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x 
i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi371: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi372: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi373: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi374: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi375: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi376: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp 
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi377: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi378: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi379: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = 
bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp eq <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; 
NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q 
%zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; 
NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: 
vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret 
i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem: +; NoVLX: 
## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw 
$1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; 
NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi380: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi381: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi382: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi383: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi384: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi385: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi386: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi387: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi388: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, 
%extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi389: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi390: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi391: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi392: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi393: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi394: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; 
NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi395: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi396: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi397: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to 
<8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi398: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi399: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi400: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi401: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi402: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi403: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi404: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi405: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi406: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 
@test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi407: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi408: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi409: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi410: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi411: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi412: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; 
NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi413: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi414: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi415: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; 
NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp eq <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; 
CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi416: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi417: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi418: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi419: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi420: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi421: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp eq <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi422:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi423:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi424:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp eq <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi425:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi426:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi427:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp eq <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi428:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi429:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi430:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp eq <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi431:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi432:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi433:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load i64, i64* %__b
+ %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+ %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+ %2 = icmp eq <8 x i64> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %3, %2
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi434:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi435:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi436:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %1 = bitcast <8 x i64> %__b to <8 x i64>
+ %2 = icmp eq <8 x i64> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi437:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi438:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi439:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x i64>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x i64>
+ %2 = icmp eq <8 x i64> %0, %1
+ %3 = shufflevector
<8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi440: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi441: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi442: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## 
%entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi443: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi444: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi445: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi446: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi447: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi448: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi449: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi450: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi451: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp eq <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi452: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi453: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi454: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi455: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi456: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi457: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi458: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi459: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi460: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi461: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi462: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi463: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi464: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi465: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi466: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi467: +; 
NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { 
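+; Masked variant: the i16 mask %__u is bitcast to <16 x i1> and ANDed with the compare result before the zero-extending shuffle, so the VLX target folds the mask into the {%k1} write-mask of vpcmpgtb.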
+; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi468: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi469: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi470: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi471: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi472: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi473: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi474: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi475: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, 
%edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi476: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi477: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi478: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi479: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi480: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi481: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi482: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi483: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw 
$5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi484: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi485: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi486: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi487: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi488: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi489: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi490: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi491: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; 
NoVLX-NEXT: Lcfi492: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi493: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi494: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi495: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi496: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi497: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi498: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi499: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi500: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi501: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi502: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi503: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi504: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi505: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi506: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi507: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi508: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi509: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi510: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi511: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi512: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi513: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi514: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi515: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sgt <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 +; 
CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi516: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi517: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi518: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp sgt <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi519: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi520: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi521: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp sgt <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: 
vpcmpgtb %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi522: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi523: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi524: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp sgt <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi525: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi526: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi527: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vpcmpgtb (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp sgt <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u
to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi528: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi529: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi530: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi531: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi532: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi533: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; 
CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi534: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi535: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi536: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi537: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi538: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi539: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 
{%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi540: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi541: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi542: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi543: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi544: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi545: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, 
%eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi546: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi547: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi548: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq 
%rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi549: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi550: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi551: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sgt <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> 
%__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi552: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi553: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi554: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi555: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi556: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi557: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi558: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi559: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; 
NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi560: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi561: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi562: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi563: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi564: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi565: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi566: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi567: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi568: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi569: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi570: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi571: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi572: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi573: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi574: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi575: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi576: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi577: +; NoVLX-NEXT: .cfi_offset 
%rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi578: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi579: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi580: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi581: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi582: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi583: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq 
-40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi584: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi585: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi586: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi587: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi588: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi589: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi590: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi591: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 
+; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sgt <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi592: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi593: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi594: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi595: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi596: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi597: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi598: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi599: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <16 x i16>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <16 x i16>
+  %2 = icmp sgt <16 x i16> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi600:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi601:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi602:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi603:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi604:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi605:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi606:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi607:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <16 x i16>
+  %1 = bitcast <4 x i64> %__b to <16 x i16>
+  %2 = icmp sgt <16 x i16> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi608:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi609:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi610:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi611:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi612:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi613:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi614:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi615:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <16 x i16>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <16 x i16>
+  %2 = icmp sgt <16 x i16> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi616:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi617:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi618:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <32 x i16>
+  %1 = bitcast <8 x i64> %__b to <32 x i16>
+  %2 = icmp sgt <32 x i16> %0, %1
+  %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi619:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi620:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi621:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <32 x i16>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <32 x i16>
+  %2 = icmp sgt <32 x i16> %0, %1
+  %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi622:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi623:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi624:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm8, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <32 x i16>
+  %1 = bitcast <8 x i64> %__b to <32 x i16>
+  %2 = icmp sgt <32 x i16> %0, %1
+  %3 = bitcast i32 %__u to <32 x i1>
+  %4 = and <32 x i1> %2, %3
+  %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi625:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi626:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi627:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm4, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <32 x i16>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <32 x i16>
+  %2 = icmp sgt <32 x i16> %0, %1
+  %3 = bitcast i32 %__u to <32 x i1>
+  %4 = and <32 x i1> %2, %3
+  %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp sgt <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+;
NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: +; CHECK: 
## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, 
%k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} 
zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi628: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi629: +; NoVLX-NEXT: .cfi_offset 
%rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi630: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi631: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi632: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi633: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi634: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi635: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi636: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; 
NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi637: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi638: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi639: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; 
NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi640: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi641: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi642: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi643: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi644: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi645: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, 
%k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi646: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi647: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi648: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 
x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi649: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi650: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi651: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi652: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi653: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi654: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi655: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi656: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi657: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; 
NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi658: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi659: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi660: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sgt <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi661: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi662: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi663: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, 
%k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = icmp sgt <4 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = and <4 x i1> %extract.i, %2
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp sgt <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp sgt <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %1 = bitcast <4 x i64> %__b to <8 x i32>
+ %2 = icmp sgt <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x i32>
+ %2 = icmp sgt <8 x i32> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x i32>
+ %load = load i32, i32* %__b
+ %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+ %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = icmp sgt <8 x i32> %0, %1
+ %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+;
CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi664: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi665: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi666: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> 
%__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi667: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi668: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi669: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi670: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi671: 
+; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi672: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi673: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi674: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi675: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi676: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi677: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi678: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, 
%r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi679: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi680: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi681: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 
+; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi682: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi683: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi684: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* 
%__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi685: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi686: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi687: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi688: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi689: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi690: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi691: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi692: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi693: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi694: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi695: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi696: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi697: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi698: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi699: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi700: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi701: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi702: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi703: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi704: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi705: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi706: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi707: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi708: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi709: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi710: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi711: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi712: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi713: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi714: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi715: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: 
kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi716: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi717: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi718: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi719: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi720: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi721: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi722: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi723: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 
x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi724: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi725: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi726: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi727: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi728: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi729: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi730: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi731: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi732: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi733: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi734: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi735: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi736: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi737: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi738: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi739: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, 
%k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi740: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi741: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi742: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi743: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi744: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi745: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi746: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi747: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: 
vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_vpcmpsgtd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi748: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi749: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi750: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi751: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi752: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi753: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi754: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi755: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi756: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi757: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi758: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi759: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi760: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi761: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi762: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi763: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax 
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi764: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi765: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi766: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi767: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi768: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi769: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi770: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi771: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## 
%entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi772: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi773: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi774: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi775: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi776: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi777: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi778: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi779: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi780: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi781: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi782: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi783: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi784: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi785: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi786: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi787: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi788: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi789: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi790: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi791: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi792: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi793: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi794: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi795: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp sgt <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: 
test_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sgt <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sgt <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sgt <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = bitcast <4 x i1> %5 to i4
+  ret i4 %6
+}
+
+define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sgt <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = bitcast <4 x i1> %5 to i4
+  ret i4 %6
+}
+
+
+define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> zeroinitializer
+  %2 = icmp sgt <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0,
%zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask(i8 zeroext 
%__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + 
%load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl 
%al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, 
%k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> 
zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> 
%extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi796: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi797: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi798: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi799: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi800: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi801: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> 
%__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi802: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi803: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi804: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi805: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi806: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi807: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; 
NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi808: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi809: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi810: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi811: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi812: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi813: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi814: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi815: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi816: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq 
(%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi817: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi818: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi819: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi820: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi821: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi822: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi823: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi824: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi825: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: 
test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi826: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi827: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi828: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi829: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi830: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi831: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sgt <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; 
NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: 
vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, 
%zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: 
kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: 
kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: 
vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 
@test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: 
vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: 
kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi832: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi833: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi834: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry 
+define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi835:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi836:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi837:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x i64>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi838:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi839:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi840:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %1 = bitcast <4 x i64> %__b to <4 x i64>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi841:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi842:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi843:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x i64>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi844:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi845:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi846:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi847:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi848:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi849:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi850:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi851:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi852:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %1 = bitcast <4 x i64> %__b to <4 x i64>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi853:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi854:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi855:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x i64>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi856:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi857:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi858:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %1 = bitcast <4 x i64> %__b to <4 x i64>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi859:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi860:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi861:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x i64>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi862:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi863:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi864:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi865:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi866:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi867:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi868:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi869:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi870:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi871:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi872:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi873:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi874:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi875:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi876:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi877:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi878:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi879:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi880:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi881:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi882:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi883:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi884:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi885:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
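+; The v64i1 variants below return i64 masks. Without VLX, the lowering spills
+; two 16-bit mask words to the stack and recombines the 32-bit halves; the
+; recombination is, schematically (illustrative pseudo-IR, not taken from the
+; checks themselves):
+;
+;   %hi.shifted = shl i64 %hi, 32
+;   %res = or i64 %hi.shifted, %lo
+;
+; which corresponds to the movl/shlq $32/orq sequences in the NoVLX checks.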
+define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi886:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi887:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi888:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi889:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi890:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi891:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi892:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi893:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi894:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi895:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi896:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi897:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi898:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi899:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi900:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi901:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi902:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi903:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = icmp sgt <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi904:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi905:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi906:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi907:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi908:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi909:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi910:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi911:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sge <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi912: +; NoVLX-NEXT: .cfi_def_cfa_offset 
16 +; NoVLX-NEXT: Lcfi913: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi914: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi915: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi916: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi917: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi918: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi919: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, 
%zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sge <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi920: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi921: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi922: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi923: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi924: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi925: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi926: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi927: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sge <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi928: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi929: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi930: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi931: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi932: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi933: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi934: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi935: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sge <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi936: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi937: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, 
%rbp +; NoVLX-NEXT: Lcfi938: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi939: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi940: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi941: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi942: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi943: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, 
%k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sge <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi944: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi945: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi946: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi947: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi948: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi949: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi950: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi951: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw 
$3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sge <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi952: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi953: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi954: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi955: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi956: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi957: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi958: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi959: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp sge <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} +; 
CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi960: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi961: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi962: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi963: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi964: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi965: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi966: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi967: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp sge <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi968: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi969: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi970: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp sge <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltb (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi971: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi972: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi973: +; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp sge <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi974: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi975: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi976: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp sge <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = 
shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi977: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi978: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi979: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm4 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp sge <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 
+} + +define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x 
i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi980: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi981: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi982: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi983: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi984: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi985: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; 
NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi986: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi987: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi988: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi989: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi990: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi991: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi992: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi993: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi994: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi995: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi996: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi997: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> 
zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi998: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi999: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1000: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, 
%k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1001: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1002: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1003: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp sge <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1004: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1005: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; 
NoVLX-NEXT: Lcfi1006: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1007: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1008: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1009: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1010: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1011: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq 
-40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1012: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1013: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1014: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1015: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1016: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1017: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1018: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1019: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: 
kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1020: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1021: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1022: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1023: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1024: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1025: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1026: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1027: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1028: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1029: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1030: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; 
NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1031: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1032: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1033: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1034: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1035: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; 
NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1036: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1037: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1038: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1039: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1040: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1041: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1042: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1043: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1044: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1045: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1046: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1047: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1048: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1049: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1050: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1051: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d 
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1052: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: 
Lcfi1053: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1054: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1055: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1056: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1057: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1058: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1059: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1060: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1061: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1062: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1063: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1064: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1065: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1066: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1067: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; 
NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp sge <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1068: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1069: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1070: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; 
NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; 
NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, 
%eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp sge <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltw (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1071: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1072: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1073: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; 
NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp sge <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1074: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1075: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1076: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, 
%eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl 
%ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm6, %xmm6 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm8, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 +; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq 
%rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp sge <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1077: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1078: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1079: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax 
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm5, %ymm3 +; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4 +; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 +; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: 
shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp sge <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw 
%k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq 
$63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: 
vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+
+define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1080:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1081:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1082:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1083:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1084:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1085:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1086:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1087:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1088:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1089:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1090:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1091:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1092:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1093:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1094:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1095:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1096:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1097:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1098:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1099:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1100:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1101:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1102:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1103:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1104:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1105:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1106:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %1 = bitcast <2 x i64> %__b to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1107:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1108:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1109:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1110:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1111:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1112:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1113:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1114:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1115:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32>
+  %2 = icmp sge <4 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <8 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1116:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1117:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1118:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1119:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1120:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1121:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1122:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1123:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1124:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1125:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1126:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1127:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x i32>
+  %2 = icmp sge <8 x i32> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1128:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1129:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1130:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl
(%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> zeroinitializer + %2 = icmp sge <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1131: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1132: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1133: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> zeroinitializer + %2 = icmp sge <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext
i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1134: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1135: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1136: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1137: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1138: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1139: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: 
andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1140: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1141: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1142: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1143: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1144: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1145: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1 +; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1146: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1147: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1148: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> zeroinitializer + %2 = icmp sge <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1149: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1150: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1151: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31,
%zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> zeroinitializer + %2 = icmp sge <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1152: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1153: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1154: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1155: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1156: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1157: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1158: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1159: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1,
%r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sge <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1160: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1161: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1162: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1163: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1164: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1165: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1166: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1167: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sge <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1168: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1169: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1170: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1171: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1172: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1173: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1174: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; 
NoVLX-NEXT: Lcfi1175: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sge <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_masked_vpcmpsged_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1176: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1177: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1178: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1179: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1180: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1181: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1182: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1183: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp sge <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1 +; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1184: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1185: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1186: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1187: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1188: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1189: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1190: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1191: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> zeroinitializer + %2 = icmp sge <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1192: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1193: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1194: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1195: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1196: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1197: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1198: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1199: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw
$15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> zeroinitializer + %2 = icmp sge <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1200: +; NoVLX-NEXT:
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1201: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1202: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1203: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1204: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1205: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1206: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1207: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; 
NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %1 = bitcast <8 x i64> %__b to <16 x i32>
+  %2 = icmp sge <16 x i32> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1208:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1209:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1210:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi1211:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi1212:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi1213:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi1214:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi1215:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <16 x i32>
+  %2 = icmp sge <16 x i32> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1216:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1217:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1218:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi1219:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi1220:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi1221:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi1222:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi1223:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %1 = bitcast <8 x i64> %__b to <16 x i32>
+  %2 = icmp sge <16 x i32> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1224:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1225:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1226:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi1227:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi1228:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi1229:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi1230:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi1231:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <16 x i32>
+  %2 = icmp sge <16 x i32> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1232:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1233:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1234:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi1235:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi1236:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi1237:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi1238:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi1239:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
+  %2 = icmp sge <16 x i32> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1240:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1241:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1242:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: Lcfi1243:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi1244:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi1245:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi1246:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi1247:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x i32>
+  %load = load i32, i32* %__b
+  %vec = insertelement <16 x i32> undef, i32 %load, i32 0
+  %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32>
+  %2 = icmp sge <16 x i32> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %3, %2
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+  %6 = bitcast <4 x i1> %5 to i4
+  ret i4 %6
+}
+
+define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+  %6 = bitcast <4 x i1> %5 to i4
+  ret i4 %6
+}
+
+
+define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %extract.i, %2
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32>
+  %6 = bitcast <4 x i1> %5 to i4
+  ret i4 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+
+define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %extract.i, %2
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %extract.i, %2
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1248:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1249:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1250:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1251:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1252:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1253:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1254:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1255:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1256:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1257:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1258:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1259:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %2, %extract.i
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1260:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1261:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1262:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1263:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1264:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1265:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <2 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32>
+  %4 = and <2 x i1> %extract.i, %2
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1266:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1267:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1268:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %1 = bitcast <2 x i64> %__b to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1269:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1270:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1271:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x i64>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x i64>
+  %2 = icmp sge <2 x i64> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>
%__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1272: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1273: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1274: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp sge <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1275: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1276: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1277: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: 
vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp sge <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 +; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1278: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1279: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1280: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sge <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = 
bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1281: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1282: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1283: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp sge <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; 
NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, 
%eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 
@test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovb 
%k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sge <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, 
%xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sge <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, 
%zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq 
+; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sge <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, 
%zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp sge <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1284: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1285: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1286: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1287: +; 
NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1288: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1289: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1290: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1291: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1292: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1293: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1294: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1295: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sge <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 +; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1296:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1297:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1298:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+  %2 = icmp sge <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1299:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1300:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1301:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+  %2 = icmp sge <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1302:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1303:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1304:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %1 = bitcast <4 x i64> %__b to <4 x i64>
+  %2 = icmp sge <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1305:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1306:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1307:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x i64>
+  %2 = icmp sge <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1308:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1309:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1310:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %1 = bitcast <4 x i64> %__b to <4 x i64>
+  %2 = icmp sge <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1311:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1312:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1313:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x i64>
+  %2 = icmp sge <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %2, %extract.i
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1314:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1315:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1316:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+  %2 = icmp sge <4 x i64> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1317:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1318:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1319:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <4 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32>
+  %2 = icmp sge <4 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32>
+  %4 = and <4 x i1> %extract.i, %2
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1320:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1321:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1322:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1323:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1324:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1325:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1326:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1327:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1328:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1329:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1330:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1331:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1332:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1333:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1334:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1335:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1336:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1337:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1338:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1339:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1340:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1341:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1342:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1343:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1344:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1345:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1346:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %1 = bitcast <8 x i64> %__b to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1347:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1348:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1349:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x i64>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1350:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1351:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1352:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1353:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1354:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1355:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x i64>
+  %load = load i64, i64* %__b
+  %vec = insertelement <8 x i64> undef, i64 %load, i32 0
+  %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32>
+  %2 = icmp sge <8 x i64> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %3, %2
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1356:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1357:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1358:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi1359:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi1360:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi1361:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi1362:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi1363:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <16 x i8>
+  %1 = bitcast <2 x i64> %__b to <16 x i8>
+  %2 = icmp ult <16 x i8> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1364:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1365:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1366:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: Lcfi1367:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: Lcfi1368:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: Lcfi1369:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: Lcfi1370:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: Lcfi1371:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT:
kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp ult <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1372: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1373: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1374: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1375: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1376: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1377: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1378: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1379: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp ult <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = 
bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1380: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1381: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1382: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1383: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1384: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1385: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1386: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1387: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp ult <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1388: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1389: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1390: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1391: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1392: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1393: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1394: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1395: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp ult <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1396: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1397: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1398: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1399: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1400: +; 
NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1401: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1402: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1403: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; 
NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp ult <16 x i8> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1404: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1405: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1406: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1407: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1408: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1409: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1410: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1411: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %1 = bitcast <2 x i64> %__b to <16 x i8> + %2 = icmp ult <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1412: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1413: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1414: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1415: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1416: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1417: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1418: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1419: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp ult <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_vpcmpultb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1420: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1421: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1422: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp ult <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltub (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1423: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1424: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1425: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: 
movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp ult <32 x i8> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1426: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1427: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1428: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm5, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %1 = bitcast <4 x i64> %__b to <32 x i8> + %2 = icmp ult <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1429: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1430: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; 
NoVLX-NEXT: Lcfi1431: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <32 x i8> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <32 x i8> + %2 = icmp ult <32 x i8> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; 
NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1432: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1433: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1434: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1435: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1436: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1437: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1438: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1439: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1440: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1441: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1442: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1443: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1444: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1445: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1446: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1447: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1448: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1449: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp ult <8 x 
i16> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1450: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1451: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1452: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %1 = bitcast <2 x i64> %__b to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_masked_vpcmpultw_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1453: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1454: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1455: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <8 x i16> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <8 x i16> + %2 = icmp ult <8 x i16> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## 
%entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1456: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1457: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1458: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1459: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1460: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1461: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1462: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1463: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp ult <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1464: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1465: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1466: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1467: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1468: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1469: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1470: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1471: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp ult <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1472: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1473: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1474: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1475: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1476: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1477: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1478: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1479: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, 
%k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp ult <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, 
%k1 +; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1480: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1481: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1482: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1483: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1484: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1485: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1486: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1487: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp ult <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1488: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1489: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1490: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1491: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1492: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1493: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1494: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1495: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp ult <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1496: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1497: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1498: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1499: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1500: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1501: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1502: +; NoVLX-NEXT: .cfi_offset 
%r14, -32 +; NoVLX-NEXT: Lcfi1503: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; 
NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp ult <16 x i16> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1504: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1505: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1506: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1507: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1508: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1509: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1510: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1511: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; 
NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %1 = bitcast <4 x i64> %__b to <16 x i16> + %2 = icmp ult <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1512: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1513: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1514: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1515: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1516: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1517: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1518: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1519: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, 
%zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <16 x i16> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <16 x i16> + %2 = icmp ult <16 x i16> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_vpcmpultw_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1520: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1521: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1522: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; 
NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor 
%ymm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: 
vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp ult <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuw (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1523: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1524: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1525: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw 
$1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: 
kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
+; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp ult <32 x i16> %0, %1 + %3 = shufflevector <32 x i1> %2, <32 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1526: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1527: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1528: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; 
NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; 
NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm7 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm6 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm6, %ymm4, %ymm3 +; NoVLX-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm4 +; NoVLX-NEXT: vpxor %ymm6, %ymm8, %ymm2 +; NoVLX-NEXT: vpxor %ymm6, %ymm5, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm7, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm4, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %1 = bitcast <8 x i64> %__b to <32 x i16> + %2 = icmp ult <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1529: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1530: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1531: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm5 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: 
shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm7 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm7, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor (%rsi), %ymm5, %ymm6 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm6, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: 
vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor 32(%rsi), %ymm5, %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <32 x i16> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <32 x i16> + %2 = icmp ult <32 x i16> %0, %1 + %3 = bitcast i32 %__u to <32 x i1> + %4 = and <32 x i1> %2, %3 + %5 = shufflevector <32 x i1> %4, <32 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, 
%zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: 
kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; 
NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; 
NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 
@test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, 
%k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, 
%zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = 
icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1532: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1533: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1534: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1535: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1536: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1537: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; 
NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1538: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1539: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1540: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1541: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1542: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; 
NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1543: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1544: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1545: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1546: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, 
(%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1547: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1548: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1549: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; 
CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1550: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1551: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1552: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1553: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1554: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1555: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; 
NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1556: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1557: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1558: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %1 = bitcast <2 x i64> %__b to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, 
%rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1559: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1560: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1561: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1562: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1563: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1564: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; 
NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1565: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1566: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1567: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp ult <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + 
%4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) 
local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1568: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1569: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1570: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1571: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1572: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1573: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1574: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1575: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1576: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1577: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1578: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1579: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; 
NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1580: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1581: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1582: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 
@test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1583: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1584: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1585: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1586: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1587: +; NoVLX-NEXT: 
.cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1588: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1589: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1590: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1591: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1592: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1593: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1594: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1595: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1596: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1597: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, 
%edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1598: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1599: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1600: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), 
%eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1601: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1602: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1603: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp ult <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 
x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1604: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1605: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1606: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1607: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1608: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1609: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1610: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1611: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1612: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1613: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1614: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1615: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1616: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1617: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1618: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1619: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1620: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1621: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1622: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1623: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1624: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1625: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1626: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1627: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1628: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1629: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1630: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: 
pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1631: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1632: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1633: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1634: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1635: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load 
= load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1636: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1637: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1638: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1639: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1640: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1641: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1642: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1643: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1644: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1645: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1646: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1647: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1648: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1649: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1650: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1651: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw 
%k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1652: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1653: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1654: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1655: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1656: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1657: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1658: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1659: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud %zmm1, 
%zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: +; 
CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1660: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1661: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1662: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1663: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1664: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1665: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1666: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1667: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, 
%ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1668: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1669: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1670: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1671: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1672: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1673: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1674: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1675: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1676: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1677: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1678: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1679: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1680: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1681: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1682: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1683: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; 
CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1684: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1685: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1686: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1687: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1688: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1689: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1690: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1691: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: 
vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> + %2 = icmp ult <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1692: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1693: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1694: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1695: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1696: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1697: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1698: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1699: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %load = load i32, i32* %__b + %vec = insertelement <16 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + %2 = icmp ult <16 x i32> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %3, %2 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +;
CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + +define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1,
%k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0,
%k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %6 = bitcast <4 x i1> %5 to i4 + ret i4 %6 +} + + +define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a
to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb
$0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +;
NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +;
NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ##
%entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL:
test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1700: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1701: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1702: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +;
NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1703: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1704: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1705: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1706: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1707: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1708: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}}
ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1709: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1710: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1711: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b)
local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1712: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1713: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1714: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1715: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1716: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1717: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT:
vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1718: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1719: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1720: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1721: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1722: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1723: +; NoVLX-NEXT: .cfi_def_cfa_register 
%rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1724: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1725: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1726: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) 
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %1 = bitcast <2 x i64> %__b to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1727: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1728: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1729: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x i64> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %2, %extract.i + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; 
CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1730: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1731: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1732: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1733: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1734: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1735: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x i64> + %load = load i64, i64* %__b + %vec = insertelement <2 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> + %2 = icmp ult <2 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> + %4 = and <2 x i1> %extract.i, %2 + %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 
= bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: 
vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + +define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: 
vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; 
NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq 
%ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, 
%k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, 
%zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 
= shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 
x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1736: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1737: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1738: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1739: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1740: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1741: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + 
%1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1742: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1743: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1744: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1745: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1746: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp 
+; NoVLX-NEXT: Lcfi1747: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1748: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1749: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1750: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1751: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1752: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1753: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_vpcmpultq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1754: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1755: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1756: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1757: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1758: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1759: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1760: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1761: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1762: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %1 = bitcast <4 x i64> %__b to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define 
zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1763: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1764: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1765: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1766: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1767: +; 
NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1768: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1769: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1770: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1771: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: 
kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load i64, i64* %__b + %vec = insertelement <4 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> + %2 = icmp ult <4 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %extract.i, %2 + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x 
i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + +define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> + %6 = bitcast <16 x i1> %5 to i16 + ret i16 %6 +} + + +define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1772: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1773: 
+; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1774: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1775: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1776: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1777: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi 
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1778: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1779: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1780: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, 
%zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + +define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1781: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1782: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1783: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: 
test_vpcmpultq_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1784: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1785: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1786: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1787: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1788: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1789: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, 
%zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1790: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1791: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1792: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw 
$10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1793: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1794: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1795: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1796: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1797: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1798: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %1 = bitcast <8 x i64> %__b to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector 
<8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + +define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1799: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1800: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1801: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <8 x i64> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %2, %3 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: +; NoVLX: ## 
BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1802: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1803: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1804: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { +; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1805: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1806: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1807: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw 
%k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <8 x i64> + %load = load i64, i64* %__b + %vec = insertelement <8 x i64> undef, i64 %load, i32 0 + %1 = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> + %2 = icmp ult <8 x i64> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> + %6 = bitcast <64 x i1> %5 to i64 + ret i64 %6 +} + + +declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) +define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %1 = bitcast <2 x i64> %__b to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load float, float* %__b + %vec = insertelement <4 x float> undef, float %load, i32 0 + %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + + +define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = 
[0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %1 = bitcast <2 x i64> %__b to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd 
$255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load float, float* %__b + %vec = insertelement <4 x float> undef, float %load, i32 0 + %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + + +define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1808: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1809: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1810: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %1 = bitcast <2 x i64> %__b to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1811: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1812: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1813: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1814: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1815: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1816: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load float, float* %__b + %vec = insertelement <4 x float> undef, float %load, i32 0 + %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1817: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1818: +; NoVLX-NEXT: 
.cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1819: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %1 = bitcast <2 x i64> %__b to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1820: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1821: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1822: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x float> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: +; CHECK: ## 
BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1823: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1824: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1825: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x float> + %load = load float, float* %__b + %vec = insertelement <4 x float> undef, float %load, i32 0 + %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> + %2 = fcmp oeq <4 x float> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + + +define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %1 = bitcast <4 x i64> %__b to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 
x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $8, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + + +define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1826: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1827: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1828: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %1 = bitcast <4 x i64> %__b to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1829: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1830: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1831: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: +; 
NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1832: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1833: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1834: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1835: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1836: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1837: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %1 = bitcast <4 x i64> %__b to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1838: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1839: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1840: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: 
kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <8 x float> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1841: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1842: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1843: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: 
vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x float> + %load = load float, float* %__b + %vec = insertelement <8 x float> undef, float %load, i32 0 + %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> + %2 = fcmp oeq <8 x float> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + + +define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1844: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1845: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1846: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1847: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1848: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1849: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1850: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1851: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1852: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1853: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1854: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1855: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1856: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1857: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1858: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1859: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1860: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1861: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1862: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: Lcfi1863: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1864: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1865: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1866: +; NoVLX-NEXT: .cfi_offset 
%r14, -32 +; NoVLX-NEXT: Lcfi1867: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load float, float* %__b + %vec = insertelement <16 x float> undef, float %load, i32 0 + %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; 
CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8) + %3 = zext i16 %2 to i32 + ret i32 %3 +} + + +define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1868: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1869: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1870: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1871: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1872: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1873: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1874: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1875: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, 
%r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1876: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1877: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1878: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: Lcfi1879: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1880: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1881: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1882: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1883: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load <8 x i64>, <8 x i64>* %__b + %1 = bitcast <8 x i64> %load to <16 x float> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1884: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1885: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1886: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; 
NoVLX-NEXT: Lcfi1887: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: Lcfi1888: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: Lcfi1889: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: Lcfi1890: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: Lcfi1891: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %load = load float, 
float* %__b + %vec = insertelement <16 x float> undef, float %load, i32 0 + %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> + %2 = fcmp oeq <16 x float> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + + +define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzwl %ax, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x float> + %1 = bitcast <8 x i64> %__b to <16 x float> + %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8) + %3 = zext i16 %2 to i64 + ret i64 %3 +} + + +declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) +define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + +define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; 
CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> + %4 = bitcast <4 x i1> %3 to i4 + ret i4 %4 +} + + +define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* 
%__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + + +define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd 
(%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + + +define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1892: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1893: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1894: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, 
%zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1895: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1896: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1897: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1898: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1899: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1900: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + + +define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1901: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1902: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1903: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %1 = bitcast <2 x i64> %__b to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1904: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1905: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1906: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <2 x double> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; CHECK-NEXT: kmovq %k0, %rax +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: Lcfi1907: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: Lcfi1908: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: Lcfi1909: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <2 x double> + %load = load double, double* %__b + %vec = insertelement <2 x double> undef, double %load, i32 0 + %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> + %2 = fcmp oeq <2 x double> %0, %1 + %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + + +define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd %ymm1, 
%ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 
{{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %load = load double, double* %__b + %vec = insertelement <4 x double> undef, double %load, i32 0 + %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + + +define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; 
NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x double> + %1 = bitcast <4 x i64> %__b to <4 x double> + %2 = fcmp oeq <4 x double> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: +; NoVLX: ## BB#0: ## %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1910:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1911:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1912:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1913:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1914:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1915:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1916:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1917:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1918:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1919:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1920:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1921:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1922:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1923:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1924:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1925:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1926:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1927:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
+  %3 = zext i8 %2 to i16
+  ret i16 %3
+}
+
+
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1928:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1929:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1930:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1931:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1932:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1933:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1934:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1935:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1936:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
+  %3 = zext i8 %2 to i32
+  ret i32 %3
+}
+
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1937:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1938:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1939:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1940:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1941:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1942:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: Lcfi1943:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: Lcfi1944:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: Lcfi1945:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; NoVLX: ## BB#0: ## %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
+  %3 = zext i8 %2 to i64
+  ret i64 %3
+}
+
+
Index: test/CodeGen/X86/compress_expand.ll
===================================================================
--- test/CodeGen/X86/compress_expand.ll
+++ test/CodeGen/X86/compress_expand.ll
@@ -265,9 +265,7 @@
 ; SKX: # BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
 ; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1}
 ; SKX-NEXT: retq
 ;
@@ -295,9 +293,7 @@
 ; SKX: # BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
 ; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1}
 ; SKX-NEXT: retq
 ;
Index: test/CodeGen/X86/masked_memop.ll
===================================================================
--- test/CodeGen/X86/masked_memop.ll
+++ test/CodeGen/X86/masked_memop.ll
@@ -462,9 +462,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
 ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -550,9 +548,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
 ; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
 ; SKX-NEXT: retq
 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -601,9 +597,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -645,9 +639,7 @@
 ; SKX: ## BB#0:
 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
 ; SKX-NEXT: retq
 %mask = icmp eq <2 x i32> %trigger, zeroinitializer