diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -96,6 +96,14 @@
 * Functions with the probe-stack attribute set to "inline-asm" are now protected
   against stack clash without the need for a third-party probing function and
   with limited impact on performance.
+* The -x86-enable-old-knl-abi command-line switch has been removed. v32i16/v64i8
+  vectors are now always passed in a single ZMM register when avx512f is enabled
+  and avx512bw is disabled.
+* Vectors larger than 512 bits with i16 or i8 elements will be passed in
+  multiple ZMM registers when avx512f is enabled. Previously this required
+  avx512bw; otherwise they would be split into multiple YMM registers. This
+  means vXi16/vXi8 vectors are now treated consistently with
+  vXi32/vXi64/vXf64/vXf32 vectors of the same total width.

 Changes to the AMDGPU Target
 -----------------------------

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -862,6 +862,57 @@
     }

     switch (N->getOpcode()) {
+    case X86ISD::VBROADCAST: {
+      MVT VT = N->getSimpleValueType(0);
+      // Emulate v32i16/v64i8 broadcast without BWI.
+      if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+        MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+        SDLoc dl(N);
+        SDValue NarrowBCast =
+            CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
+        SDValue Res =
+            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+        unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+                              CurDAG->getIntPtrConstant(Index, dl));
+
+        --I;
+        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+        ++I;
+        CurDAG->DeleteNode(N);
+      }
+
+      break;
+    }
+    case X86ISD::VBROADCAST_LOAD: {
+      MVT VT = N->getSimpleValueType(0);
+      // Emulate v32i16/v64i8 broadcast without BWI.
+      if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+        MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+        auto *MemNode = cast<MemSDNode>(N);
+        SDLoc dl(N);
+        SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
+        SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
+        SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
+            X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
+            MemNode->getMemOperand());
+        SDValue Res =
+            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+        unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+                              CurDAG->getIntPtrConstant(Index, dl));
+
+        --I;
+        SDValue To[] = {Res, NarrowBCast.getValue(1)};
+        CurDAG->ReplaceAllUsesWith(N, To);
+        ++I;
+        CurDAG->DeleteNode(N);
+      }
+
+      break;
+    }
     case ISD::VSELECT: {
       // Replace VSELECT with non-mask conditions with BLENDV.
       if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -75,13 +75,6 @@
     " of the loop header PC will be 0)."),
     cl::Hidden);

-// Added in 10.0.
-static cl::opt<bool> EnableOldKNLABI(
-    "x86-enable-old-knl-abi", cl::init(false),
-    cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
-             "one ZMM register on AVX512F, but not AVX512BW targets."),
-    cl::Hidden);
-
 static cl::opt<bool> MulConstantOptimization(
     "mul-constant-optimization", cl::init(true),
     cl::desc("Replace 'mul x, Const' with more effective instructions like "

@@ -1457,10 +1450,14 @@
   // elements. 512-bits can be disabled based on prefer-vector-width and
   // required-vector-width function attributes.
   if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
+    bool HasBWI = Subtarget.hasBWI();
+
     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
     addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
     addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);

@@ -1525,18 +1522,17 @@
     setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
     setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

-    // Need to custom widen this if we don't have AVX512BW.
-    setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
-
     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
       setOperationAction(ISD::FFLOOR, VT, Legal);
       setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);

@@ -1550,48 +1546,68 @@
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
       setOperationAction(ISD::FROUND, VT, Custom);
-
-      setOperationAction(ISD::SELECT, VT, Custom);
     }

-    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
-    for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
+    for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
     }

-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
+    setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
+    setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
+    setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+    setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
+
+    setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+    setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+    setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ?
Legal : Custom); + setOperationAction(ISD::MUL, MVT::v64i8, Custom); + + setOperationAction(ISD::MULHU, MVT::v16i32, Custom); + setOperationAction(ISD::MULHS, MVT::v16i32, Custom); + setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::MULHS, MVT::v64i8, Custom); + setOperationAction(ISD::MULHU, MVT::v64i8, Custom); - setOperationAction(ISD::MUL, MVT::v8i64, Custom); - setOperationAction(ISD::MUL, MVT::v16i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); - setOperationAction(ISD::MULHU, MVT::v16i32, Custom); - setOperationAction(ISD::MULHS, MVT::v16i32, Custom); + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + // The condition codes aren't legal in SSE/AVX and under AVX512 we use + // setcc all the way to isel and prefer SETGT in some isel patterns. + setCondCodeAction(ISD::SETLT, VT, Custom); + setCondCodeAction(ISD::SETLE, VT, Custom); + } for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); + } - // The condition codes aren't legal in SSE/AVX and under AVX512 we use - // setcc all the way to isel and prefer SETGT in some isel patterns. - setCondCodeAction(ISD::SETLT, VT, Custom); - setCondCodeAction(ISD::SETLE, VT, Custom); + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SSUBSAT, VT, HasBWI ? 
Legal : Custom); } if (Subtarget.hasDQI()) { @@ -1626,27 +1642,28 @@ MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, + MVT::v16f32, MVT::v8f64 }) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + } + for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } if (!Subtarget.hasBWI()) { - // Need to custom split v32i16/v64i8 bitcasts. - setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); - setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); - - // Better to split these into two 256-bit ops. - setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom); - setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom); + setOperationAction(ISD::STORE, MVT::v32i16, Custom); + setOperationAction(ISD::STORE, MVT::v64i8, Custom); } if (Subtarget.hasVBMI2()) { @@ -1781,80 +1798,22 @@ // disabled based on prefer-vector-width and required-vector-width function // attributes. if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { - addRegisterClass(MVT::v32i16, &X86::VR512RegClass); - addRegisterClass(MVT::v64i8, &X86::VR512RegClass); - // Extends from v64i1 masks to 512-bit vectors. 
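
The net effect of moving the v32i16/v64i8 register classes out of the useBWIRegs() block and marking their operations HasBWI ? Legal : Custom is easiest to see on a small example. A minimal sketch, not taken from the patch (function name and llc invocation are illustrative): with llc -mtriple=x86_64-- -mattr=+avx512f the type is now legal and lives in one ZMM register, and the add should be custom-split into two 256-bit vpaddw ops; adding +avx512bw should instead select a single 512-bit vpaddw.

define <32 x i16> @add_v32i16(<32 x i16> %a, <32 x i16> %b) {
  ; The type is legal on avx512f-only targets now; only the operation
  ; (not the type) gets split when BWI is unavailable.
  %r = add <32 x i16> %a, %b
  ret <32 x i16> %r
}
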
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::MUL, MVT::v32i16, Legal); - setOperationAction(ISD::MUL, MVT::v64i8, Custom); - setOperationAction(ISD::MULHS, MVT::v32i16, Legal); - setOperationAction(ISD::MULHU, MVT::v32i16, Legal); - setOperationAction(ISD::MULHS, MVT::v64i8, Custom); - setOperationAction(ISD::MULHU, MVT::v64i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); - setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); - setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); - setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16 }) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::ABS, VT, Legal); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTLZ, VT, Custom); - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::UADDSAT, VT, Legal); - setOperationAction(ISD::SADDSAT, VT, Legal); - setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::SSUBSAT, VT, Legal); - setOperationAction(ISD::SELECT, VT, Custom); - - // The condition codes aren't legal in SSE/AVX and under AVX512 we use - // setcc all the way to isel and prefer SETGT in some isel patterns. 
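
The VBROADCAST/VBROADCAST_LOAD preprocessing added in X86ISelDAGToDAG.cpp earlier in the patch can be exercised with the usual splat idiom. A sketch under the same assumptions (illustrative function name, not from the patch): without avx512bw, the 512-bit splat below is expected to select a 256-bit vpbroadcastb followed by a vinserti64x4 that duplicates it into the upper half, mirroring the two INSERT_SUBVECTOR nodes built during preprocessing.

define <64 x i8> @splat_v64i8(i8 %x) {
  %ins = insertelement <64 x i8> undef, i8 %x, i32 0
  %splat = shufflevector <64 x i8> %ins, <64 x i8> undef, <64 x i32> zeroinitializer
  ret <64 x i8> %splat
}
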
-      setCondCodeAction(ISD::SETLT, VT, Custom);
-      setCondCodeAction(ISD::SETLE, VT, Custom);
     }

     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
     }

-    if (Subtarget.hasBITALG()) {
-      for (auto VT : { MVT::v64i8, MVT::v32i16 })
-        setOperationAction(ISD::CTPOP, VT, Legal);
-    }
-
     if (Subtarget.hasVBMI2()) {
       setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
       setOperationAction(ISD::FSHR, MVT::v32i16, Custom);

@@ -2097,7 +2056,8 @@
 TargetLoweringBase::LegalizeTypeAction
 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
-  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+  if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
+      !Subtarget.hasBWI())
     return TypeSplitVector;

   if (VT.getVectorNumElements() != 1 &&

@@ -2156,11 +2116,6 @@
     return RegisterVT;
   }

-  // FIXME: Should we just make these types legal and custom split operations?
-  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
-      Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
-    return MVT::v16i32;
-
   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
 }

@@ -2179,11 +2134,6 @@
     return NumRegisters;
   }

-  // FIXME: Should we just make these types legal and custom split operations?
-  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
-      Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
-    return 1;
-
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
 }

@@ -17506,6 +17456,9 @@
                                                 Subtarget, DAG))
     return Broadcast;

+  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+
   // Dispatch to each element type for lowering. If we don't have support for
   // specific element type shuffles at 512 bits, immediately split them and
   // lower them. Each lowering routine of a given type is allowed to assume that

@@ -17988,6 +17941,10 @@
   unsigned EltSize = VT.getScalarSizeInBits();
   unsigned NumElts = VT.getVectorNumElements();

+  // Expand v32i16/v64i8 without BWI.
+  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+    return SDValue();
+
   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
   // into an i1 condition so that we can use the mask-based 512-bit blend
   // instructions.

@@ -20139,14 +20096,9 @@
   unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);

-  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
-  if (InVT == MVT::v8i8) {
-    if (VT != MVT::v8i64)
-      return SDValue();
-
-    In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
-                     MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
-    return DAG.getNode(ExtendInVecOpc, dl, VT, In);
+  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+    assert(InVT == MVT::v32i8 && "Unexpected VT!");
+    return splitVectorIntUnary(Op, DAG);
   }

   if (Subtarget.hasInt256())

@@ -20491,6 +20443,11 @@
   // vpmovqb/w/d, vpmovdb/w, vpmovwb
   if (Subtarget.hasAVX512()) {
+    if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
+      assert(VT == MVT::v32i8 && "Unexpected VT!");
+      return splitVectorIntUnary(Op, DAG);
+    }
+
     // word to byte only under BWI. Otherwise we have to promote to v16i32
     // and then truncate that. But we should only do that if we haven't been
     // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be

@@ -22252,6 +22209,9 @@
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return splitIntVSETCC(Op, DAG);

+  if (VT == MVT::v32i16 || VT == MVT::v64i8)
+    return splitIntVSETCC(Op, DAG);
+
   // If this is a SETNE against the signed minimum value, change it to SETGT.
   // If this is a SETNE against the signed maximum value, change it to SETLT,
   // which will be swapped to SETGT.

@@ -23224,14 +23184,9 @@
           InVT.getVectorElementType() == MVT::i32) &&
          "Unexpected element type");

-  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
-  if (InVT == MVT::v8i8) {
-    if (VT != MVT::v8i64)
-      return SDValue();
-
-    In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
-                     MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
-    return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
+  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+    assert(InVT == MVT::v32i8 && "Unexpected VT!");
+    return splitVectorIntUnary(Op, DAG);
   }

   if (Subtarget.hasInt256())

@@ -23365,7 +23320,9 @@
   // and each half can execute independently. Some cores would split the op into
   // halves anyway, so the concat (vinsertf128) is purely an extra op.
   MVT StoreVT = StoredVal.getSimpleValueType();
-  if (StoreVT.is256BitVector()) {
+  if (StoreVT.is256BitVector() ||
+      ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
+       !Subtarget.hasBWI())) {
     SmallVector<SDValue, 4> CatOps;
     if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
       return splitVectorStore(St, DAG);

@@ -26195,6 +26152,9 @@
     return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
                        Op.getOperand(0), Op.getOperand(1));

+  if (VT == MVT::v32i16 || VT == MVT::v64i8)
+    return splitVectorIntBinary(Op, DAG);
+
   assert(Op.getSimpleValueType().is256BitVector() &&
          Op.getSimpleValueType().isInteger() &&
          "Only handle AVX 256-bit vector integer operation");

@@ -26243,6 +26203,9 @@
     return SDValue();
   }

+  if (VT == MVT::v32i16 || VT == MVT::v64i8)
+    return splitVectorIntBinary(Op, DAG);
+
   assert(Op.getSimpleValueType().is256BitVector() &&
          Op.getSimpleValueType().isInteger() &&
          "Only handle AVX 256-bit vector integer operation");

@@ -26279,6 +26242,9 @@
     return splitVectorIntUnary(Op, DAG);
   }

+  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+    return splitVectorIntUnary(Op, DAG);
+
   // Default to expand.
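
For the deleted getRegisterTypeForCallingConv/getNumRegistersForCallingConv overrides: with v32i16/v64i8 now legal, single-ZMM argument passing falls out of the generic code with no v16i32 coercion. A hedged sketch (identifiers are mine, not from the tests): compiled with llc -mattr=+avx512f, the argument below should arrive in zmm0 with or without avx512bw, matching the release note above, rather than depending on the removed coercion (or on the removed flag's pre-10.0 two-YMM ABI).

define <64 x i8> @pass_v64i8(<64 x i8> %v) {
  ; Expected: %v is received in zmm0 and returned in zmm0.
  ret <64 x i8> %v
}
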
return SDValue(); } @@ -26290,6 +26256,9 @@ if (VT.getScalarType() != MVT::i64 && VT.is256BitVector()) return splitVectorIntBinary(Op, DAG); + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); + SDLoc DL(Op); unsigned Opcode = Op.getOpcode(); SDValue N0 = Op.getOperand(0); @@ -26334,6 +26303,9 @@ if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntBinary(Op, DAG); + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return splitVectorIntBinary(Op, DAG); + SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); @@ -26480,6 +26452,9 @@ if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntBinary(Op, DAG); + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return splitVectorIntBinary(Op, DAG); + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) { assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasInt256()) || @@ -26861,7 +26836,7 @@ return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || - VT == MVT::v64i8) { + (Subtarget.hasBWI() && VT == MVT::v64i8)) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -27307,8 +27282,8 @@ // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI. if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) && - (VT == MVT::v16i8 || VT == MVT::v64i8 || - (VT == MVT::v32i8 && Subtarget.hasInt256())) && + (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8); @@ -27546,6 +27521,9 @@ if (VT.is256BitVector()) return splitVectorIntBinary(Op, DAG); + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); + return SDValue(); } @@ -28076,18 +28054,6 @@ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } - // Custom splitting for BWI types when AVX512F is available but BWI isn't. - if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() && - DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) { - SDLoc dl(Op); - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); - MVT CastVT = DstVT.getHalfNumVectorElementsVT(); - Lo = DAG.getBitcast(CastVT, Lo); - Hi = DAG.getBitcast(CastVT, Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); - } - // Use MOVMSK for vector to scalar conversion to prevent scalarization. if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) { assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512"); @@ -28356,12 +28322,9 @@ SDValue In = Op.getOperand(0); SDLoc DL(Op); - // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB - // lowering. - if (VT == MVT::v8i64 || VT == MVT::v16i32) { - assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE"); + // Split v64i8 without BWI so that we can still use the PSHUFB lowering. + if (VT == MVT::v64i8 && !Subtarget.hasBWI()) return splitVectorIntUnary(Op, DAG); - } unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && @@ -30005,19 +29968,6 @@ return; } - // Custom splitting for BWI types when AVX512F is available but BWI isn't. 
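
The splitVectorIntBinary fallbacks in the preceding hunks also cover the saturating-arithmetic intrinsics that are Legal only with BWI. A small sketch, assuming an avx512f-only target (function name is illustrative): the call should be split into two 256-bit vpaddsw instructions, while with avx512bw it should remain a single 512-bit vpaddsw.

declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)

define <32 x i16> @sadd_sat_v32i16(<32 x i16> %a, <32 x i16> %b) {
  %r = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  ret <32 x i16> %r
}
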
- if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) && - SrcVT.isVector() && isTypeLegal(SrcVT)) { - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); - MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8; - Lo = DAG.getBitcast(CastVT, Lo); - Hi = DAG.getBitcast(CastVT, Hi); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); - Results.push_back(Res); - return; - } - if (DstVT.isVector() && SrcVT == MVT::x86mmx) { assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && "Unexpected type action!"); @@ -40532,10 +40482,14 @@ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) return SDValue(); - // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. - // Also allow v2i32 if it will be widened. + // Make sure the type is legal or will be widened to a legal type. + if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT)) + + // Without BWI, we would need to split v32i16. + if (WVT == MVT::v32i16 && !Subtarget.hasBWI()) return SDValue(); SDValue N0 = N->getOperand(0); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -304,6 +304,10 @@ { ISD::SRA, MVT::v2i64, 1 }, { ISD::SRA, MVT::v4i64, 1 }, { ISD::SRA, MVT::v8i64, 1 }, + + { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand. + { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand. + { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb. }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -370,6 +374,14 @@ { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence + { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence + { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence + { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence + { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence + { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence + { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence + { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence + { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence }; if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || @@ -446,11 +458,32 @@ return LT.first * Entry->Cost; } + static const CostTblEntry AVX512BWShiftCostTable[] = { + { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v8i16, 1 }, // vpsravw + + { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v16i16, 1 }, // vpsravw + + { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v32i16, 1 }, // vpsravw + }; + + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry AVX2UniformCostTable[] = { // Uniform splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i16, 1 }, // psllw. { ISD::SRL, MVT::v16i16, 1 }, // psrlw. { ISD::SRA, MVT::v16i16, 1 }, // psraw. + { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw. + { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw. 
+ { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw. }; if (ST->hasAVX2() && @@ -495,18 +528,6 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX512BWCostTable[] = { - { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw - { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw - { ISD::SRA, MVT::v8i16, 1 }, // vpsravw - - { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw - { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw - { ISD::SRA, MVT::v16i16, 1 }, // vpsravw - - { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw - { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw - { ISD::SRA, MVT::v32i16, 1 }, // vpsravw - { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. @@ -533,6 +554,7 @@ { ISD::SRA, MVT::v4i64, 1 }, { ISD::SRA, MVT::v8i64, 1 }, + { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org) @@ -568,6 +590,17 @@ { ISD::SRL, MVT::v4i64, 1 }, }; + if (ST->hasAVX512()) { + if (ISD == ISD::SHL && LT.second == MVT::v32i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX512, a packed v32i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + } + // Look for AVX2 lowering tricks. if (ST->hasAVX2()) { if (ISD == ISD::SHL && LT.second == MVT::v16i16 && @@ -667,13 +700,19 @@ static const CostTblEntry AVX2CostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence. { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence. { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence. { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence. { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. + { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence. { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. + { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence. { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. @@ -1070,6 +1109,8 @@ {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd + {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps @@ -1101,7 +1142,14 @@ {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q - {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d + {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d + + // FIXME: This just applies the type legalization cost rules above + // assuming these completely split. 
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, + {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, + {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, + {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, }; if (ST->hasAVX512()) @@ -1358,6 +1406,8 @@ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7 },// 2*vpmovqd+concat+vpmovdb + { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 9 }, // FIXME + // v16i1 -> v16i32 - load + broadcast { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, @@ -1372,6 +1422,9 @@ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, @@ -1843,6 +1896,12 @@ { ISD::SELECT, MVT::v16i32, 1 }, { ISD::SELECT, MVT::v8f64, 1 }, { ISD::SELECT, MVT::v16f32, 1 }, + + { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4 + { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4 + + { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3 + { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3 }; static const CostTblEntry AVX2CostTbl[] = { @@ -2005,12 +2064,20 @@ static const CostTblEntry AVX512CostTbl[] = { { ISD::BITREVERSE, MVT::v8i64, 36 }, { ISD::BITREVERSE, MVT::v16i32, 24 }, + { ISD::BITREVERSE, MVT::v32i16, 10 }, + { ISD::BITREVERSE, MVT::v64i8, 10 }, { ISD::CTLZ, MVT::v8i64, 29 }, { ISD::CTLZ, MVT::v16i32, 35 }, + { ISD::CTLZ, MVT::v32i16, 28 }, + { ISD::CTLZ, MVT::v64i8, 18 }, { ISD::CTPOP, MVT::v8i64, 16 }, { ISD::CTPOP, MVT::v16i32, 24 }, + { ISD::CTPOP, MVT::v32i16, 18 }, + { ISD::CTPOP, MVT::v64i8, 12 }, { ISD::CTTZ, MVT::v8i64, 20 }, { ISD::CTTZ, MVT::v16i32, 28 }, + { ISD::CTTZ, MVT::v32i16, 24 }, + { ISD::CTTZ, MVT::v64i8, 18 }, { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq @@ -2019,6 +2086,14 @@ { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq + { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split + { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split + { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split + { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split + { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split + { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split + { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split + { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split { ISD::FMAXNUM, MVT::f32, 2 }, { ISD::FMAXNUM, MVT::v4f32, 2 }, { ISD::FMAXNUM, MVT::v8f32, 2 }, diff --git a/llvm/test/Analysis/CostModel/X86/arith-fix.ll b/llvm/test/Analysis/CostModel/X86/arith-fix.ll --- a/llvm/test/Analysis/CostModel/X86/arith-fix.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-fix.ll @@ -120,11 +120,11 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> 
@llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'smul' @@ -158,11 +158,11 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'smul' @@ -354,11 +354,11 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 
14 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'umul' @@ -392,11 +392,11 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'umul' diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll --- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll @@ -1076,7 +1076,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 
= call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) @@ -1114,7 +1114,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) @@ -1314,7 +1314,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } 
@llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) @@ -1352,7 +1352,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/arith.ll b/llvm/test/Analysis/CostModel/X86/arith.ll --- a/llvm/test/Analysis/CostModel/X86/arith.ll +++ b/llvm/test/Analysis/CostModel/X86/arith.ll @@ -507,11 +507,11 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = or i16 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = or <8 x i16> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = or <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = or <32 x i16> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = or <32 x i16> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = or i8 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = or <16 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = or <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = or <64 x i8> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = or <64 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = or i1 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = or <2 x i1> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = or <4 x i1> undef, undef @@ -559,11 +559,11 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = or i16 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = or <8 x i16> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V16I16 = or <16 x i16> undef, undef -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = or <32 x i16> undef, undef +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = or <32 x i16> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = or i8 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = or <16 x i8> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = or <32 x i8> undef, undef -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = or <64 x i8> undef, undef +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = or <64 x i8> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = or i1 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = or <2 x i1> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = or <4 x i1> undef, undef @@ -669,11 +669,11 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = xor i16 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = xor <8 x i16> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = xor <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = xor <32 x i16> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = xor <32 x i16> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = xor i8 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = xor <16 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = xor <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = xor <64 x i8> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = xor <64 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = xor i1 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = xor <2 x i1> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = xor <4 x i1> undef, undef @@ -721,11 +721,11 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = xor i16 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = xor <8 x i16> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = xor <16 x i16> undef, undef -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = xor <32 x i16> undef, undef +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = xor <32 x i16> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = xor i8 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = xor <16 x i8> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = xor <32 x i8> undef, undef -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V64I8 = xor <64 x i8> undef, undef +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = xor <64 x i8> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = xor i1 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = xor <2 x i1> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = xor <4 x i1> undef, undef @@ -831,11 +831,11 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = and i16 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = and <8 x i16> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = and <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = and <32 x i16> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = and <32 x i16> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = and i8 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = and <16 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = and <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = and <64 x i8> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = and <64 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = and i1 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = and <2 x i1> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = and <4 x i1> undef, undef @@ -883,11 +883,11 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = and i16 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = and <8 x i16> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = and <16 x i16> undef, undef -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = and <32 x i16> undef, undef +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = and <32 x i16> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = and i8 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = and <16 x i8> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = and <32 x i8> undef, undef -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = and <64 x i8> undef, undef +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = and <64 x i8> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I1 = and i1 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I1 = and <2 x i1> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I1 = and <4 x i1> undef, undef diff --git a/llvm/test/Analysis/CostModel/X86/fshl.ll b/llvm/test/Analysis/CostModel/X86/fshl.ll --- a/llvm/test/Analysis/CostModel/X86/fshl.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl.ll @@ -194,7 +194,7 @@ ; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'var_funnel_i16' @@ -208,7 +208,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'var_funnel_i16' @@ -279,7 +279,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'var_funnel_i8' @@ -293,7 +293,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 
for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'var_funnel_i8' @@ -560,7 +560,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatvar_funnel_i16' @@ -578,7 +578,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatvar_funnel_i16' @@ -669,7 +669,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatvar_funnel_i8' @@ -687,7 +687,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) ; AVX512DQ-NEXT: Cost Model: 
Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatvar_funnel_i8' @@ -914,7 +914,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'constant_funnel_i16' @@ -928,7 +928,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'constant_funnel_i16' @@ -999,7 +999,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'constant_funnel_i8' @@ -1013,7 +1013,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) ; 
AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'constant_funnel_i8' @@ -1230,7 +1230,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatconstant_funnel_i16' @@ -1244,7 +1244,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_funnel_i16' @@ -1315,7 +1315,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; 
AVX512BW-LABEL: 'splatconstant_funnel_i8' @@ -1329,7 +1329,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_funnel_i8' @@ -1539,7 +1539,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'var_rotate_i16' @@ -1553,7 +1553,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'var_rotate_i16' @@ -1624,7 +1624,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'var_rotate_i8' @@ -1638,7 +1638,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'var_rotate_i8' @@ -1878,7 +1878,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatvar_rotate_i16' @@ -1896,7 +1896,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatvar_rotate_i16' @@ -1987,7 +1987,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) ; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatvar_rotate_i8' @@ -2005,7 +2005,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatvar_rotate_i8' @@ -2225,7 +2225,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'constant_rotate_i16' @@ -2239,7 +2239,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'constant_rotate_i16' @@ -2310,7 +2310,7 @@ ; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'constant_rotate_i8' @@ -2324,7 +2324,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'constant_rotate_i8' @@ -2520,7 +2520,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' @@ -2534,7 +2534,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; 
AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i16' @@ -2598,7 +2598,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' @@ -2612,7 +2612,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' diff --git a/llvm/test/Analysis/CostModel/X86/fshr.ll b/llvm/test/Analysis/CostModel/X86/fshr.ll --- a/llvm/test/Analysis/CostModel/X86/fshr.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr.ll @@ -194,7 +194,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'var_funnel_i16' @@ -208,7 +208,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> 
@llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'var_funnel_i16' @@ -279,7 +279,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'var_funnel_i8' @@ -293,7 +293,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'var_funnel_i8' @@ -560,7 +560,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatvar_funnel_i16' @@ -578,7 +578,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, 
<32 x i32> zeroinitializer ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatvar_funnel_i16' @@ -669,7 +669,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatvar_funnel_i8' @@ -687,7 +687,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatvar_funnel_i8' @@ -914,7 +914,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> 
@llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'constant_funnel_i16' @@ -928,7 +928,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'constant_funnel_i16' @@ -999,7 +999,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'constant_funnel_i8' @@ -1013,7 +1013,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'constant_funnel_i8' @@ -1230,7 +1230,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 
x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatconstant_funnel_i16' @@ -1244,7 +1244,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_funnel_i16' @@ -1315,7 +1315,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatconstant_funnel_i8' @@ -1329,7 +1329,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_funnel_i8' @@ -1540,7 +1540,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 
for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'var_rotate_i16' @@ -1554,7 +1554,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'var_rotate_i16' @@ -1625,7 +1625,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'var_rotate_i8' @@ -1639,7 +1639,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'var_rotate_i8' @@ -1879,7 +1879,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> 
%c512, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatvar_rotate_i16' @@ -1897,7 +1897,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatvar_rotate_i16' @@ -1988,7 +1988,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatvar_rotate_i8' @@ -2006,7 +2006,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512DQ-NEXT: Cost Model: Found an 
estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatvar_rotate_i8' @@ -2226,7 +2226,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'constant_rotate_i16' @@ -2240,7 +2240,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'constant_rotate_i16' @@ -2311,7 +2311,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'constant_rotate_i8' @@ -2325,7 +2325,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; 
AVX512DQ-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'constant_rotate_i8' @@ -2521,7 +2521,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' @@ -2535,7 +2535,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i16' @@ -2599,7 +2599,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' @@ -2613,7 +2613,7 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> 
%a128, <16 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' diff --git a/llvm/test/Analysis/CostModel/X86/icmp.ll b/llvm/test/Analysis/CostModel/X86/icmp.ll --- a/llvm/test/Analysis/CostModel/X86/icmp.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp.ll @@ -508,13 +508,13 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef @@ -835,13 +835,13 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for 
diff --git a/llvm/test/Analysis/CostModel/X86/icmp.ll b/llvm/test/Analysis/CostModel/X86/icmp.ll
--- a/llvm/test/Analysis/CostModel/X86/icmp.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp.ll
@@ -508,13 +508,13 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef
@@ -835,13 +835,13 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef
@@ -1162,13 +1162,13 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef
@@ -1816,13 +1816,13 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef
@@ -2143,13 +2143,13 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef
@@ -2470,13 +2470,13 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef
@@ -3124,13 +3124,13 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -107,11 +107,11 @@
 ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
@@ -276,11 +276,11 @@
 ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 168 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 352 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
@@ -499,11 +499,11 @@
 ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
@@ -668,11 +668,11 @@
 ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 1, <4 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 88 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 1, <8 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
 ; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
@@ -820,59 +820,32 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; KNL-LABEL: 'masked_expandload'
-; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
-;
-; SKX-LABEL: 'masked_expandload'
-; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(double* undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(double* undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(float* undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(float* undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(float* undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(float* undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(i64* undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(i64* undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(i64* undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(i64* undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(i32* undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(i32* undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(i32* undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(i32* undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(i16* undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(i16* undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(i16* undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(i16* undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(double* undef, <8 x i1> undef, <8 x double> undef)
 %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(double* undef, <4 x i1> undef, <4 x double> undef)
@@ -989,59 +962,32 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; KNL-LABEL: 'masked_compressstore'
-; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 223 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
-;
-; SKX-LABEL: 'masked_compressstore'
-; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 119 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 239 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, double* undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, double* undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, float* undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, float* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, float* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, float* undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, i64* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, i64* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, i64* undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, i64* undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, i32* undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, i32* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, i32* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, i32* undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 119 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, i16* undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, i16* undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, i16* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, i16* undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 239 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, i8* undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, i8* undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, i8* undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, i8* undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 call void @llvm.masked.compressstore.v8f64(<8 x double> undef, double* undef, <8 x i1> undef)
 call void @llvm.masked.compressstore.v4f64(<4 x double> undef, double* undef, <4 x i1> undef)
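
The reduction costs in the next few files move in the opposite direction from the icmp costs above: the v32i16/v64i8 input is now a single ZMM value, and without AVX512BW the reduction sequence needs extra widening/splitting steps that the model prices higher than the old pair-of-YMM form. A reduced check for one of the new numbers, again with assumed RUN flags in the style of these files:

; Minimal sketch (assumed flags): the reduce-add.ll diff below raises the
; AVX512F-only cost of a v32i16 add reduction from 6 to 11.
; RUN: opt < %s -mtriple=x86_64-- -cost-model -analyze -mattr=+avx512f | FileCheck %s
declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
define i16 @reduce_v32i16(<32 x i16> %v) {
; CHECK: cost of 11 {{.*}} %r = call i16
  %r = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %v)
  ret i16 %r
}
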
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll
--- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll
@@ -141,8 +141,8 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
@@ -159,8 +159,8 @@
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i16'
@@ -218,8 +218,8 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
@@ -238,8 +238,8 @@
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'reduce_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll b/llvm/test/Analysis/CostModel/X86/reduce-and.ll
--- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll
@@ -93,32 +93,14 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
 %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
@@ -150,35 +132,15 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
 %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
--- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -155,8 +155,8 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i16'
@@ -173,8 +173,8 @@
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
@@ -223,8 +223,8 @@
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'reduce_i8'
@@ -243,8 +243,8 @@
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
--- a/llvm/test/Analysis/CostModel/X86/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
@@ -93,32 +93,14 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef)
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-;
AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'reduce_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX512-NEXT: 
Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) @@ -150,35 +132,15 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 
@llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'reduce_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll @@ -182,8 +182,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -200,8 +200,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) @@ -270,8 +270,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -290,8 +290,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret 
i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll @@ -182,8 +182,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -200,8 +200,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) @@ -270,8 +270,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) 
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -290,8 +290,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll @@ -182,8 +182,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -200,8 +200,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 
@llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) @@ -270,8 +270,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -290,8 +290,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll @@ -182,8 +182,8 @@ ; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -200,8 +200,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) @@ -270,8 +270,8 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX512F-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -290,8 +290,8 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll @@ -93,32 +93,14 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 
11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'reduce_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) @@ -150,35 +132,15 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'reduce_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll --- a/llvm/test/Analysis/CostModel/X86/rem.ll +++ b/llvm/test/Analysis/CostModel/X86/rem.ll @@ -1054,43 +1054,24 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = urem <64 x i8> undef, ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'urem_constpow2' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = urem <32 x i16> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = urem <64 x i8> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'urem_constpow2' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = 
urem <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = urem <32 x i16> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = urem <64 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'urem_constpow2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = urem <32 x i16> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = urem <64 x i8> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = urem i64 undef, 16 %V2i64 = urem <2 x i64> undef, @@ -1368,43 +1349,24 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = urem <64 x i8> undef, ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'urem_uniformconstpow2' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8i64 = urem <8 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i16 = urem <32 x i16> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64i8 = urem <64 x i8> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'urem_uniformconstpow2' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = urem <32 x i16> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = urem <64 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'urem_uniformconstpow2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = urem <4 x 
i64> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = urem <8 x i64> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = urem <8 x i32> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = urem <16 x i32> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = urem <16 x i16> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = urem <32 x i16> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = urem <32 x i8> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = urem <64 x i8> undef,
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 16
   %V2i64 = urem <2 x i64> undef,
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
@@ -397,123 +397,64 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXi16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SLM-LABEL: 'test_vXi16'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32>
@@ -1050,243 +991,124 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32>
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'test_vXi8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16
x i8> %src128, <16 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = 
shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; AVX512-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector 
<64 x i8> %src512, <64 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'test_vXi8' ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll @@ -222,7 +222,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'test_vXi16' @@ -309,7 +309,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX512F-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'test_vXi8' diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll @@ -264,7 +264,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'test_vXi16' diff --git a/llvm/test/Analysis/CostModel/X86/trunc.ll b/llvm/test/Analysis/CostModel/X86/trunc.ll --- a/llvm/test/Analysis/CostModel/X86/trunc.ll +++ b/llvm/test/Analysis/CostModel/X86/trunc.ll @@ -120,31 +120,18 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'trunc_vXi16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'trunc_vXi16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V32i64 = trunc <32 x i64> undef to <32 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16> -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'trunc_vXi16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'trunc_vXi16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i16> @@ -285,13 +272,13 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i8> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8> +; AVX512F-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i8> diff --git a/llvm/test/Analysis/CostModel/X86/vector-extract.ll b/llvm/test/Analysis/CostModel/X86/vector-extract.ll --- a/llvm/test/Analysis/CostModel/X86/vector-extract.ll +++ b/llvm/test/Analysis/CostModel/X86/vector-extract.ll @@ -586,55 +586,30 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'extract_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = extractelement <2 x i16> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = extractelement <2 x i16> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = extractelement <4 x i16> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = extractelement <4 x i16> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = extractelement <4 x i16> undef, i32 3 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v32i16_24 = extractelement <32 x i16> undef, i32 24 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'extract_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = extractelement <2 x i16> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = extractelement <2 x i16> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = extractelement <2 x i16> undef, i32 1 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = extractelement <4 x i16> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = extractelement <4 x i16> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = extractelement <4 x i16> undef, i32 3 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'extract_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = extractelement <2 x i16> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = extractelement <2 x i16> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%v2i16_1 = extractelement <2 x i16> undef, i32 1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = extractelement <4 x i16> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = extractelement <4 x i16> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = extractelement <4 x i16> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'extract_i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = extractelement <2 x i16> undef, i32 %arg @@ -882,71 +857,38 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'extract_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = extractelement <2 x i8> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = extractelement <4 x i8> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0 -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = extractelement <4 x i8> undef, i32 3 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = extractelement <8 x i8> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = extractelement <8 x i8> undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'extract_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = extractelement <2 x i8> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = extractelement <4 x i8> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = extractelement <4 x i8> undef, i32 3 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = extractelement <8 x i8> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = extractelement <8 x i8> undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; 
AVX512-LABEL: 'extract_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = extractelement <2 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = extractelement <4 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = extractelement <4 x i8> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = extractelement <8 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = extractelement <8 x i8> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'extract_i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = extractelement <2 x i8> undef, i32 %arg diff --git a/llvm/test/Analysis/CostModel/X86/vector-insert.ll b/llvm/test/Analysis/CostModel/X86/vector-insert.ll --- a/llvm/test/Analysis/CostModel/X86/vector-insert.ll +++ b/llvm/test/Analysis/CostModel/X86/vector-insert.ll @@ -606,55 +606,30 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'insert_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = insertelement <2 x i16> undef, i16 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> undef, i16 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = insertelement <4 x i16> undef, i16 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> undef, i16 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> undef, i16 undef, i32 3 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'insert_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = insertelement <2 x i16> undef, i16 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> undef, i16 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = insertelement <4 x i16> undef, i16 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> undef, i16 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> undef, i16 undef, i32 3 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 -; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'insert_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_a = insertelement <2 x i16> undef, i16 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> undef, i16 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> undef, i16 undef, i32 1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_a = insertelement <4 x i16> undef, i16 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> undef, i16 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> undef, i16 undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = insertelement <8 x i16> undef, i16 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> undef, i16 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> undef, i16 undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = insertelement <16 x i16> undef, i16 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> undef, i16 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> undef, i16 undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> undef, i16 undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> undef, i16 undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = insertelement <32 x i16> undef, i16 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> undef, i16 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> undef, i16 undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> undef, i16 undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> undef, i16 undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_16 = insertelement <32 x i16> undef, i16 undef, i32 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> undef, i16 undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> undef, i16 undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %v2i16_a = insertelement <2 x i16> undef, i16 undef, i32 %arg %v2i16_0 = insertelement <2 x i16> undef, i16 undef, i32 0 @@ -852,71 +827,38 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX512F-LABEL: 'insert_i8' -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = insertelement <4 x i8> undef, i8 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = insertelement <8 x i8> undef, i8 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = 
insertelement <64 x i8> undef, i8 undef, i32 31 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX512BW-LABEL: 'insert_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = insertelement <4 x i8> undef, i8 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = insertelement <8 x i8> undef, i8 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 -; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX512-LABEL: 'insert_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = insertelement <4 x i8> undef, i8 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = insertelement <8 x i8> undef, i8 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> undef, i8 undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = insertelement <32 x i8> undef, i8 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> undef, i8 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> undef, i8 undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> undef, i8 undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> undef, i8 undef, i32 
15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> undef, i8 undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> undef, i8 undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = insertelement <64 x i8> undef, i8 undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> undef, i8 undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> undef, i8 undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> undef, i8 undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> undef, i8 undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> undef, i8 undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> undef, i8 undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_32 = insertelement <64 x i8> undef, i8 undef, i32 32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> undef, i8 undef, i32 48 +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> undef, i8 undef, i32 63 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'insert_i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg diff --git a/llvm/test/CodeGen/X86/avg-mask.ll b/llvm/test/CodeGen/X86/avg-mask.ll --- a/llvm/test/CodeGen/X86/avg-mask.ll +++ b/llvm/test/CodeGen/X86/avg-mask.ll @@ -123,33 +123,32 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind { ; AVX512F-LABEL: avg_v64i8_mask: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: movl %edi, %ecx ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrq $32, %rdi ; AVX512F-NEXT: shrq $48, %rax ; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpavgb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: kmovw %edi, %k4 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} -; AVX512F-NEXT: vpmovdb %zmm5, %xmm5 -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} ; 
AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_mask: @@ -178,26 +177,26 @@ ; AVX512F-NEXT: shrq $32, %rdi ; AVX512F-NEXT: shrq $48, %rax ; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: kmovw %edi, %k4 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_maskz: @@ -327,21 +326,20 @@ define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind { ; AVX512F-LABEL: avg_v32i16_mask: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpavgw %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; 
AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_mask: @@ -366,18 +364,18 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_maskz: diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -624,31 +624,18 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: avg_v48i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqu %xmm1, (%rax) -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) -; AVX512F-NEXT: vmovdqu %xmm2, (%rax) -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: avg_v48i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 -; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqu %xmm1, (%rax) -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: vmovdqu %xmm2, (%rax) -; AVX512BW-NEXT: retq +; AVX512-LABEL: avg_v48i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 +; AVX512-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX512-NEXT: vmovdqu %xmm1, (%rax) +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vmovdqu %xmm2, (%rax) +; AVX512-NEXT: retq %1 = load <48 x i8>, <48 x i8>* %a %2 = load <48 x i8>, <48 x i8>* %b %3 = zext <48 x i8> %1 to <48 x i32> @@ -2189,49 +2176,49 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: movq %rsp, %rbp -; AVX512F-NEXT: andq $-32, %rsp -; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: andq $-64, %rsp +; AVX512F-NEXT: subq $64, %rsp ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8 -; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9 -; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10 -; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11 -; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12 -; AVX512F-NEXT: vmovdqa 
80(%rbp), %ymm13 -; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14 -; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15 -; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1 -; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2 -; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3 -; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4 -; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5 -; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6 +; AVX512F-NEXT: vpavgb 16(%rbp), %ymm0, %ymm8 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpavgb 48(%rbp), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb 80(%rbp), %ymm1, %ymm9 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpavgb 112(%rbp), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb 144(%rbp), %ymm2, %ymm10 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vpavgb 176(%rbp), %ymm2, %ymm2 +; AVX512F-NEXT: vpavgb 208(%rbp), %ymm3, %ymm11 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: vpavgb 240(%rbp), %ymm3, %ymm3 +; AVX512F-NEXT: vpavgb 272(%rbp), %ymm4, %ymm12 +; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm4 +; AVX512F-NEXT: vpavgb 304(%rbp), %ymm4, %ymm4 +; AVX512F-NEXT: vpavgb 336(%rbp), %ymm5, %ymm13 +; AVX512F-NEXT: vextracti64x4 $1, %zmm5, %ymm5 +; AVX512F-NEXT: vpavgb 368(%rbp), %ymm5, %ymm5 +; AVX512F-NEXT: vpavgb 400(%rbp), %ymm6, %ymm14 +; AVX512F-NEXT: vextracti64x4 $1, %zmm6, %ymm6 +; AVX512F-NEXT: vpavgb 432(%rbp), %ymm6, %ymm6 +; AVX512F-NEXT: vpavgb 464(%rbp), %ymm7, %ymm15 +; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm7 ; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7 -; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15 -; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14 -; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13 -; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12 -; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11 -; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10 -; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9 -; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8 -; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi) -; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi) -; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi) -; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi) -; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi) +; AVX512F-NEXT: vmovdqa %ymm7, 480(%rdi) +; AVX512F-NEXT: vmovdqa %ymm15, 448(%rdi) +; AVX512F-NEXT: vmovdqa %ymm6, 416(%rdi) +; AVX512F-NEXT: vmovdqa %ymm14, 384(%rdi) +; AVX512F-NEXT: vmovdqa %ymm5, 352(%rdi) ; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi) -; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi) -; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi) -; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi) -; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi) -; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi) -; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi) -; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi) -; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdi) +; AVX512F-NEXT: vmovdqa %ymm4, 288(%rdi) +; AVX512F-NEXT: vmovdqa %ymm12, 256(%rdi) +; AVX512F-NEXT: vmovdqa %ymm3, 224(%rdi) +; AVX512F-NEXT: vmovdqa %ymm11, 192(%rdi) +; AVX512F-NEXT: vmovdqa %ymm2, 160(%rdi) +; AVX512F-NEXT: vmovdqa %ymm10, 128(%rdi) +; AVX512F-NEXT: vmovdqa %ymm1, 96(%rdi) +; AVX512F-NEXT: vmovdqa %ymm9, 64(%rdi) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdi) +; AVX512F-NEXT: vmovdqa %ymm8, (%rdi) ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp ; AVX512F-NEXT: vzeroupper @@ -2834,411 +2821,277 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: not_avg_v16i8_wide_constants: -; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rbp -; 
AVX512F-NEXT: pushq %r15 -; AVX512F-NEXT: pushq %r14 -; AVX512F-NEXT: pushq %r13 -; AVX512F-NEXT: pushq %r12 -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX512F-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512F-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-NEXT: vpextrq $1, %xmm4, %r10 -; AVX512F-NEXT: vmovq %xmm4, %r11 -; AVX512F-NEXT: vpextrq $1, %xmm3, %r14 -; AVX512F-NEXT: vmovq %xmm3, %r13 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-NEXT: vpextrq $1, %xmm4, %r12 -; AVX512F-NEXT: vmovq %xmm4, %r9 -; AVX512F-NEXT: vpextrq $1, %xmm3, %rdi -; AVX512F-NEXT: vmovq %xmm3, %rsi -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: vpextrq $1, %xmm3, %rcx -; AVX512F-NEXT: vmovq %xmm3, %rdx -; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vpextrq $1, %xmm2, %rbp -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512F-NEXT: vpextrq $1, %xmm4, %rbx -; AVX512F-NEXT: leaq -1(%rbp,%rbx), %rbp -; AVX512F-NEXT: vmovq %xmm4, %rbx -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512F-NEXT: leaq -1(%rax,%rbx), %rax -; AVX512F-NEXT: vmovq %xmm4, %rbx -; AVX512F-NEXT: leaq -1(%rdx,%rbx), %rdx -; AVX512F-NEXT: vpextrq $1, %xmm4, %rbx -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512F-NEXT: leaq -1(%rcx,%rbx), %rcx -; AVX512F-NEXT: vmovq %xmm1, %rbx -; AVX512F-NEXT: leaq -1(%rsi,%rbx), %rsi -; 
AVX512F-NEXT: vpextrq $1, %xmm1, %rbx -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: leaq -1(%rdi,%rbx), %r8 -; AVX512F-NEXT: vmovq %xmm1, %rbx -; AVX512F-NEXT: leaq -1(%r9,%rbx), %r15 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rbx -; AVX512F-NEXT: leaq -1(%r12,%rbx), %r9 -; AVX512F-NEXT: vmovq %xmm3, %rbx -; AVX512F-NEXT: leaq -1(%r13,%rbx), %r13 -; AVX512F-NEXT: vpextrq $1, %xmm3, %rbx -; AVX512F-NEXT: leaq -1(%r14,%rbx), %r12 -; AVX512F-NEXT: vmovq %xmm4, %rbx -; AVX512F-NEXT: leaq -1(%r11,%rbx), %r11 -; AVX512F-NEXT: vpextrq $1, %xmm4, %rbx -; AVX512F-NEXT: leaq -1(%r10,%rbx), %r14 -; AVX512F-NEXT: vmovq %xmm2, %rbx -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512F-NEXT: leaq -1(%rdi,%rbx), %rdi -; AVX512F-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512F-NEXT: vpextrq $1, %xmm2, %rbx -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512F-NEXT: leaq -1(%rdi,%rbx), %rdi -; AVX512F-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512F-NEXT: vmovq %xmm0, %rbx -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512F-NEXT: vmovq %xmm1, %r10 -; AVX512F-NEXT: leaq -1(%rbx,%r10), %rdi -; AVX512F-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512F-NEXT: vpextrq $1, %xmm0, %rbx -; AVX512F-NEXT: vpextrq $1, %xmm1, %r10 -; AVX512F-NEXT: leaq -1(%rbx,%r10), %rdi -; AVX512F-NEXT: shrq %rax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: shrq %rbp -; AVX512F-NEXT: vpinsrb $1, %ebp, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %rdx -; AVX512F-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %rcx -; AVX512F-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %rsi -; AVX512F-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %r8 -; AVX512F-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %r15 -; AVX512F-NEXT: vpinsrb $6, %r15d, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %r9 -; AVX512F-NEXT: vpinsrb $7, %r9d, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %r13 -; AVX512F-NEXT: vpinsrb $8, %r13d, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %r12 -; AVX512F-NEXT: vpinsrb $9, %r12d, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %r11 -; AVX512F-NEXT: vpinsrb $10, %r11d, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %r14 -; AVX512F-NEXT: vpinsrb $11, %r14d, %xmm0, %xmm0 -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: shrq %rax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: shrq %rax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: shrq %rax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: shrq %rdi -; AVX512F-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqu %xmm0, (%rax) -; AVX512F-NEXT: popq %rbx -; AVX512F-NEXT: popq %r12 -; AVX512F-NEXT: popq %r13 -; AVX512F-NEXT: popq %r14 -; AVX512F-NEXT: popq %r15 -; AVX512F-NEXT: popq %rbp -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: not_avg_v16i8_wide_constants: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: pushq %r15 -; AVX512BW-NEXT: pushq %r14 -; AVX512BW-NEXT: pushq %r13 -; AVX512BW-NEXT: pushq %r12 -; AVX512BW-NEXT: pushq %rbx -; AVX512BW-NEXT: subq $24, %rsp -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512BW-NEXT: vmovq %xmm3, %rbx -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rbp -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %rdi -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rsi -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512BW-NEXT: vmovq %xmm2, %rdx -; AVX512BW-NEXT: vpextrq $1, %xmm2, %r15 -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %r8 -; AVX512BW-NEXT: vpextrq $1, %xmm2, %r9 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512BW-NEXT: vmovq %xmm2, %r11 -; AVX512BW-NEXT: vpextrq $1, %xmm2, %r10 -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512BW-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512BW-NEXT: vmovq %xmm3, %rcx -; AVX512BW-NEXT: addq %rbx, %rcx -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: addq %rbp, %rax -; AVX512BW-NEXT: movq %rax, %rbp -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-NEXT: vmovq %xmm3, %r14 -; AVX512BW-NEXT: addq %rdi, %r14 -; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax -; AVX512BW-NEXT: addq %rsi, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: addq %rdx, %rax -; AVX512BW-NEXT: movq %rax, %rdx -; AVX512BW-NEXT: vpextrq $1, %xmm2, %r12 -; AVX512BW-NEXT: addq %r15, %r12 -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: addq %r8, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax -; AVX512BW-NEXT: addq %r9, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: vextracti128 $1, %ymm0, 
%xmm0 -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BW-NEXT: vmovq %xmm2, %rax -; AVX512BW-NEXT: addq %r11, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax -; AVX512BW-NEXT: addq %r10, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vmovq %xmm2, %r13 -; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512BW-NEXT: vpextrq $1, %xmm2, %rbx -; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BW-NEXT: vmovq %xmm0, %r10 -; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; AVX512BW-NEXT: vpextrq $1, %xmm0, %r9 -; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512BW-NEXT: vmovq %xmm1, %rax -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %r8 -; AVX512BW-NEXT: addq %rax, %r8 -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdi -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512BW-NEXT: addq %rdi, %rsi -; AVX512BW-NEXT: addq $-1, %rcx -; AVX512BW-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: movl $0, %r11d -; AVX512BW-NEXT: adcq $-1, %r11 -; AVX512BW-NEXT: addq $-1, %rbp -; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: movl $0, %edi -; AVX512BW-NEXT: adcq $-1, %rdi -; AVX512BW-NEXT: addq $-1, %r14 -; AVX512BW-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: movl $0, %r15d -; AVX512BW-NEXT: adcq $-1, %r15 -; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: movl $0, %r14d -; AVX512BW-NEXT: adcq $-1, %r14 -; AVX512BW-NEXT: addq $-1, %rdx -; AVX512BW-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: movl $0, %eax -; AVX512BW-NEXT: adcq $-1, %rax -; AVX512BW-NEXT: movq %rax, (%rsp) # 8-byte Spill -; AVX512BW-NEXT: addq $-1, %r12 -; AVX512BW-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: movl $0, %r12d -; AVX512BW-NEXT: adcq $-1, %r12 -; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: movl $0, %eax -; AVX512BW-NEXT: adcq $-1, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: movl $0, %eax -; AVX512BW-NEXT: adcq $-1, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: movl $0, %eax -; AVX512BW-NEXT: adcq $-1, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512BW-NEXT: movl $0, %eax -; AVX512BW-NEXT: adcq $-1, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: addq $-1, %r13 -; AVX512BW-NEXT: movl $0, %eax -; AVX512BW-NEXT: adcq $-1, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: addq $-1, %rbx -; AVX512BW-NEXT: movl $0, 
%eax -; AVX512BW-NEXT: adcq $-1, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: addq $-1, %r10 -; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: adcq $-1, %rdx -; AVX512BW-NEXT: addq $-1, %r9 -; AVX512BW-NEXT: movl $0, %ecx -; AVX512BW-NEXT: adcq $-1, %rcx -; AVX512BW-NEXT: addq $-1, %r8 -; AVX512BW-NEXT: movl $0, %eax -; AVX512BW-NEXT: adcq $-1, %rax -; AVX512BW-NEXT: addq $-1, %rsi -; AVX512BW-NEXT: movl $0, %ebp -; AVX512BW-NEXT: adcq $-1, %rbp -; AVX512BW-NEXT: shldq $63, %rsi, %rbp -; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: shldq $63, %r8, %rax -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: shldq $63, %r9, %rcx -; AVX512BW-NEXT: movq %rcx, %rbp -; AVX512BW-NEXT: shldq $63, %r10, %rdx -; AVX512BW-NEXT: movq %rdx, %r9 -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rbx, %r10 -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %r13, %r8 -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %r13 -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %rbx -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %rsi -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %rdx -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %r12 -; AVX512BW-NEXT: movq (%rsp), %rcx # 8-byte Reload -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %rcx -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %r14 -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %r15 -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %rdi -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: shldq $63, %rax, %r11 -; AVX512BW-NEXT: vmovq %r11, %xmm0 -; AVX512BW-NEXT: vmovq %rdi, %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm0, %xmm1 -; AVX512BW-NEXT: vmovq %r15, %xmm2 -; AVX512BW-NEXT: vmovq %r14, %xmm3 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovq %rcx, %xmm1 -; AVX512BW-NEXT: vmovq %r12, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, 
%xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %rdx, %xmm2 -; AVX512BW-NEXT: vmovq %rsi, %xmm3 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %rbx, %xmm1 -; AVX512BW-NEXT: vmovq %r13, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %r8, %xmm2 -; AVX512BW-NEXT: vmovq %r10, %xmm3 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %r9, %xmm1 -; AVX512BW-NEXT: vmovq %rbp, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Folded Reload -; AVX512BW-NEXT: # xmm2 = mem[0],zero -; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Folded Reload -; AVX512BW-NEXT: # xmm3 = mem[0],zero -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: addq $24, %rsp -; AVX512BW-NEXT: popq %rbx -; AVX512BW-NEXT: popq %r12 -; AVX512BW-NEXT: popq %r13 -; AVX512BW-NEXT: popq %r14 -; AVX512BW-NEXT: popq %r15 -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: not_avg_v16i8_wide_constants: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vmovq %xmm3, %rbx +; AVX512-NEXT: vpextrq $1, 
%xmm3, %rbp +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512-NEXT: vmovq %xmm3, %rdi +; AVX512-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vmovq %xmm2, %rdx +; AVX512-NEXT: vpextrq $1, %xmm2, %r15 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %r8 +; AVX512-NEXT: vpextrq $1, %xmm2, %r9 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vmovq %xmm2, %r11 +; AVX512-NEXT: vpextrq $1, %xmm2, %r10 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vmovq %xmm3, %rcx +; AVX512-NEXT: addq %rbx, %rcx +; AVX512-NEXT: vpextrq $1, %xmm3, %rax +; AVX512-NEXT: addq %rbp, %rax +; AVX512-NEXT: movq %rax, %rbp +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512-NEXT: vmovq %xmm3, %r14 +; AVX512-NEXT: addq %rdi, %r14 +; AVX512-NEXT: vpextrq $1, %xmm3, %rax +; AVX512-NEXT: addq %rsi, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vmovq %xmm2, %rax +; AVX512-NEXT: addq %rdx, %rax +; AVX512-NEXT: movq %rax, %rdx +; AVX512-NEXT: vpextrq $1, %xmm2, %r12 +; AVX512-NEXT: addq %r15, %r12 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %rax +; AVX512-NEXT: addq %r8, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm2, %rax +; AVX512-NEXT: addq %r9, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vmovq %xmm2, %rax +; AVX512-NEXT: addq %r11, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm2, %rax +; AVX512-NEXT: addq %r10, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %r13 +; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX512-NEXT: vpextrq $1, %xmm2, %rbx +; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: 
vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vmovq %xmm0, %r10 +; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; AVX512-NEXT: vpextrq $1, %xmm0, %r9 +; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %r8 +; AVX512-NEXT: addq %rax, %r8 +; AVX512-NEXT: vpextrq $1, %xmm1, %rdi +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: addq %rdi, %rsi +; AVX512-NEXT: addq $-1, %rcx +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %r11d +; AVX512-NEXT: adcq $-1, %r11 +; AVX512-NEXT: addq $-1, %rbp +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: adcq $-1, %rdi +; AVX512-NEXT: addq $-1, %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %r15d +; AVX512-NEXT: adcq $-1, %r15 +; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movl $0, %r14d +; AVX512-NEXT: adcq $-1, %r14 +; AVX512-NEXT: addq $-1, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, (%rsp) # 8-byte Spill +; AVX512-NEXT: addq $-1, %r12 +; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %r12d +; AVX512-NEXT: adcq $-1, %r12 +; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, %r13 +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, %rbx +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, %r10 +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: adcq $-1, %rdx +; AVX512-NEXT: addq $-1, %r9 +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: adcq $-1, %rcx +; AVX512-NEXT: addq $-1, %r8 +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: addq $-1, %rsi +; AVX512-NEXT: movl $0, %ebp +; AVX512-NEXT: adcq $-1, %rbp +; AVX512-NEXT: shldq $63, %rsi, %rbp +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq $63, %r8, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq $63, %r9, %rcx +; AVX512-NEXT: movq %rcx, %rbp +; AVX512-NEXT: shldq $63, %r10, %rdx +; AVX512-NEXT: movq %rdx, %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: shldq $63, %rbx, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; 
AVX512-NEXT: shldq $63, %r13, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %r13 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %rsi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %rdx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %r12 +; AVX512-NEXT: movq (%rsp), %rcx # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %rcx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %rdi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: shldq $63, %rax, %r11 +; AVX512-NEXT: vmovq %r11, %xmm0 +; AVX512-NEXT: vmovq %rdi, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm1 +; AVX512-NEXT: vmovq %r15, %xmm2 +; AVX512-NEXT: vmovq %r14, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vmovq %r12, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %rdx, %xmm2 +; AVX512-NEXT: vmovq %rsi, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %rbx, %xmm1 +; AVX512-NEXT: vmovq %r13, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vmovq %r10, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, 
%eax +; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %r9, %xmm1 +; AVX512-NEXT: vmovq %rbp, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Folded Reload +; AVX512-NEXT: # xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Folded Reload +; AVX512-NEXT: # xmm3 = mem[0],zero +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b %3 = zext <16 x i8> %1 to <16 x i128> diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL --check-prefix=KNL-NEW -; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl -x86-enable-old-knl-abi | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL --check-prefix=KNL-OLD +; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=KNL_X32 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx -fast-isel | FileCheck %s --check-prefix=FASTISEL @@ -552,31 +551,17 @@ } define void @test14(<32 x i16>* %x) { -; KNL-NEW-LABEL: test14: -; KNL-NEW: ## %bb.0: -; KNL-NEW-NEXT: pushq %rbx -; KNL-NEW-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEW-NEXT: .cfi_offset %rbx, -16 -; KNL-NEW-NEXT: movq %rdi, %rbx -; KNL-NEW-NEXT: vmovaps (%rdi), %zmm0 -; KNL-NEW-NEXT: callq _test14_callee -; KNL-NEW-NEXT: vmovaps %zmm0, (%rbx) -; KNL-NEW-NEXT: popq %rbx -; KNL-NEW-NEXT: retq -; -; KNL-OLD-LABEL: test14: -; KNL-OLD: ## %bb.0: -; KNL-OLD-NEXT: pushq %rbx -; KNL-OLD-NEXT: .cfi_def_cfa_offset 16 -; KNL-OLD-NEXT: .cfi_offset %rbx, -16 -; KNL-OLD-NEXT: movq %rdi, %rbx -; KNL-OLD-NEXT: vmovaps (%rdi), %ymm0 -; KNL-OLD-NEXT: vmovaps 32(%rdi), %ymm1 -; KNL-OLD-NEXT: callq _test14_callee -; KNL-OLD-NEXT: vmovaps %ymm1, 32(%rbx) -; KNL-OLD-NEXT: vmovaps %ymm0, (%rbx) -; KNL-OLD-NEXT: popq %rbx -; KNL-OLD-NEXT: retq +; KNL-LABEL: test14: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rbx +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: .cfi_offset %rbx, -16 +; KNL-NEXT: movq %rdi, %rbx +; KNL-NEXT: vmovaps (%rdi), %zmm0 +; KNL-NEXT: callq _test14_callee +; KNL-NEXT: vmovaps %zmm0, (%rbx) +; KNL-NEXT: popq %rbx +; KNL-NEXT: retq ; ; SKX-LABEL: 
test14: ; SKX: ## %bb.0: @@ -626,31 +611,17 @@ declare <32 x i16> @test14_callee(<32 x i16>) define void @test15(<64 x i8>* %x) { -; KNL-NEW-LABEL: test15: -; KNL-NEW: ## %bb.0: -; KNL-NEW-NEXT: pushq %rbx -; KNL-NEW-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEW-NEXT: .cfi_offset %rbx, -16 -; KNL-NEW-NEXT: movq %rdi, %rbx -; KNL-NEW-NEXT: vmovaps (%rdi), %zmm0 -; KNL-NEW-NEXT: callq _test15_callee -; KNL-NEW-NEXT: vmovaps %zmm0, (%rbx) -; KNL-NEW-NEXT: popq %rbx -; KNL-NEW-NEXT: retq -; -; KNL-OLD-LABEL: test15: -; KNL-OLD: ## %bb.0: -; KNL-OLD-NEXT: pushq %rbx -; KNL-OLD-NEXT: .cfi_def_cfa_offset 16 -; KNL-OLD-NEXT: .cfi_offset %rbx, -16 -; KNL-OLD-NEXT: movq %rdi, %rbx -; KNL-OLD-NEXT: vmovaps (%rdi), %ymm0 -; KNL-OLD-NEXT: vmovaps 32(%rdi), %ymm1 -; KNL-OLD-NEXT: callq _test15_callee -; KNL-OLD-NEXT: vmovaps %ymm1, 32(%rbx) -; KNL-OLD-NEXT: vmovaps %ymm0, (%rbx) -; KNL-OLD-NEXT: popq %rbx -; KNL-OLD-NEXT: retq +; KNL-LABEL: test15: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rbx +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: .cfi_offset %rbx, -16 +; KNL-NEXT: movq %rdi, %rbx +; KNL-NEXT: vmovaps (%rdi), %zmm0 +; KNL-NEXT: callq _test15_callee +; KNL-NEXT: vmovaps %zmm0, (%rbx) +; KNL-NEXT: popq %rbx +; KNL-NEXT: retq ; ; SKX-LABEL: test15: ; SKX: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -205,18 +205,18 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { ; KNL-LABEL: zext_32x8mem_to_32x16: ; KNL: # %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32x8mem_to_32x16: @@ -228,18 +228,18 @@ ; ; AVX512DQNOBW-LABEL: zext_32x8mem_to_32x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQNOBW-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %a = load <32 x i8>,<32 x i8> *%i,align 1 %x = zext <32 x i8> %a to <32 x i16> @@ -250,18 +250,18 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { ; KNL-LABEL: sext_32x8mem_to_32x16: ; KNL: # %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vpmovsxbw (%rdi), %ymm2 -; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm3 +; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm2 +; KNL-NEXT: vpmovsxbw (%rdi), %ymm3 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: 
sext_32x8mem_to_32x16: @@ -273,18 +273,18 @@ ; ; AVX512DQNOBW-LABEL: sext_32x8mem_to_32x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm2 -; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm3 +; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm2 +; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm3 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQNOBW-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %a = load <32 x i8>,<32 x i8> *%i,align 1 %x = sext <32 x i8> %a to <32 x i16> @@ -295,10 +295,10 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; KNL-LABEL: zext_32x8_to_32x16: ; KNL: # %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32x8_to_32x16: @@ -308,10 +308,10 @@ ; ; AVX512DQNOBW-LABEL: zext_32x8_to_32x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQNOBW-NEXT: retq %x = zext <32 x i8> %a to <32 x i16> ret <32 x i16> %x @@ -320,19 +320,19 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { ; KNL-LABEL: zext_32x8_to_32x16_mask: ; KNL: # %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpsllw $15, %ymm2, %ymm1 -; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1 -; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vpsllw $15, %ymm2, %ymm2 +; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32x8_to_32x16_mask: @@ -344,19 +344,19 @@ ; ; AVX512DQNOBW-LABEL: zext_32x8_to_32x16_mask: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm1 -; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX512DQNOBW-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQNOBW-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512DQNOBW-NEXT: retq %x = zext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer @@ -366,10 +366,10 @@ define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; KNL-LABEL: sext_32x8_to_32x16: ; KNL: # %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpmovsxbw %xmm1, %ymm1 +; KNL-NEXT: vpmovsxbw %xmm0, %ymm1 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbw %xmm0, %ymm0 -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_32x8_to_32x16: @@ -379,10 +379,10 @@ ; ; AVX512DQNOBW-LABEL: sext_32x8_to_32x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQNOBW-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm1 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQNOBW-NEXT: retq %x = sext <32 x i8> %a to <32 x i16> ret <32 x i16> %x @@ -391,19 +391,19 @@ define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { ; KNL-LABEL: sext_32x8_to_32x16_mask: ; KNL: # %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpmovsxbw %xmm0, %ymm3 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbw %xmm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpsllw $15, %ymm2, %ymm1 -; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1 -; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vpsllw $15, %ymm2, %ymm2 +; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_32x8_to_32x16_mask: @@ -415,19 +415,19 @@ ; ; AVX512DQNOBW-LABEL: sext_32x8_to_32x16_mask: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm3 ; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm1 -; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX512DQNOBW-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQNOBW-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512DQNOBW-NEXT: retq %x = sext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer @@ -1888,11 +1888,13 @@ ; KNL: # %bb.0: ; KNL-NEXT: movw $-3, %ax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k1, %k2 ; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $14, %k1, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 @@ -1900,7 +1902,8 @@ ; KNL-NEXT: kmovw %eax, %k1 
; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $13, %k1, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 @@ -1908,7 +1911,8 @@ ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $12, %k1, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 @@ -1916,7 +1920,8 @@ ; KNL-NEXT: kmovw %eax, %k6 ; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $11, %k1, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 @@ -1925,7 +1930,8 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k1, %k3 ; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $10, %k1, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 @@ -2018,37 +2024,31 @@ ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: korw %k7, %k1, %k1 ; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: kandw %k2, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kmovw %esi, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $14, %k7, %k7 ; KNL-NEXT: korw %k7, %k1, %k1 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; KNL-NEXT: kandw %k0, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kmovw %edx, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $13, %k7, %k7 ; KNL-NEXT: korw %k7, %k1, %k1 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; KNL-NEXT: kandw %k2, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kmovw %ecx, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $12, %k7, %k7 ; KNL-NEXT: korw %k7, %k1, %k1 ; KNL-NEXT: kandw %k6, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kmovw %r8d, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $11, %k7, %k7 ; KNL-NEXT: korw %k7, %k1, %k1 ; KNL-NEXT: kandw %k3, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kmovw %r9d, %k7 ; KNL-NEXT: kshiftlw $15, %k7, %k7 ; KNL-NEXT: kshiftrw $10, %k7, %k7 ; KNL-NEXT: korw %k7, %k1, %k1 @@ -2319,20 +2319,20 @@ ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: korw %k2, %k0, %k2 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} -; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; KNL-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} -; KNL-NEXT: vpmovdw %zmm4, 
%ymm4 -; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1 -; KNL-NEXT: vpmovdw %zmm5, %ymm4 -; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 -; KNL-NEXT: vpmovdw %zmm6, %ymm4 -; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3 -; KNL-NEXT: vpmovdw %zmm7, %ymm4 -; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; KNL-NEXT: vpmovdw %zmm2, %ymm2 +; KNL-NEXT: vpmovdw %zmm3, %ymm3 +; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; KNL-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; KNL-NEXT: vpmovdw %zmm4, %ymm2 +; KNL-NEXT: vpmovdw %zmm5, %ymm3 +; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; KNL-NEXT: vpandq %zmm0, %zmm2, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test21: @@ -2348,11 +2348,13 @@ ; AVX512DQNOBW: # %bb.0: ; AVX512DQNOBW-NEXT: movw $-3, %ax ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 -; AVX512DQNOBW-NEXT: kmovw %edi, %k0 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 ; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: kmovw %k1, %k2 ; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQNOBW-NEXT: kmovw %esi, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k1 ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 @@ -2360,7 +2362,8 @@ ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw %edx, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k1 ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 @@ -2368,7 +2371,8 @@ ; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: kmovw %ecx, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k1 ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 @@ -2376,7 +2380,8 @@ ; AVX512DQNOBW-NEXT: kmovw %eax, %k6 ; AVX512DQNOBW-NEXT: kandw %k6, %k0, %k0 ; AVX512DQNOBW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQNOBW-NEXT: kmovw %r8d, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k1 ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 @@ -2385,7 +2390,8 @@ ; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 ; AVX512DQNOBW-NEXT: kmovw %k1, %k3 ; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQNOBW-NEXT: kmovw %r9d, %k1 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 @@ -2478,37 +2484,31 @@ ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0 ; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kmovw %edi, %k0 ; AVX512DQNOBW-NEXT: kandw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kmovw %esi, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; 
AVX512DQNOBW-NEXT: kshiftrw $14, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512DQNOBW-NEXT: kandw %k1, %k0, %k0 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kmovw %edx, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512DQNOBW-NEXT: kandw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kmovw %ecx, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $12, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0 ; AVX512DQNOBW-NEXT: kandw %k6, %k0, %k0 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kmovw %r8d, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $11, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0 ; AVX512DQNOBW-NEXT: kandw %k3, %k0, %k0 -; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kmovw %r9d, %k7 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512DQNOBW-NEXT: kshiftrw $10, %k7, %k7 ; AVX512DQNOBW-NEXT: korw %k7, %k0, %k0 @@ -2779,20 +2779,20 @@ ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 -; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4 -; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4 ; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm7 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 -; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQNOBW-NEXT: vpmovdw %zmm5, %ymm4 -; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX512DQNOBW-NEXT: vpmovdw %zmm6, %ymm4 -; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm4, %ymm3 -; AVX512DQNOBW-NEXT: vpmovdw %zmm7, %ymm4 -; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm4, %ymm0 +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQNOBW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512DQNOBW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQNOBW-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm2 +; AVX512DQNOBW-NEXT: vpmovdw %zmm5, %ymm3 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQNOBW-NEXT: vpandq %zmm0, %zmm2, %zmm0 ; AVX512DQNOBW-NEXT: retq %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer ret <64 x i16> %ret @@ -2905,11 +2905,9 @@ ; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2 -; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; KNL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_64xi1_to_64xi8: @@ -2923,11 +2921,9 @@ ; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; 
AVX512DQNOBW-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2 -; AVX512DQNOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512DQNOBW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQNOBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <64 x i8> %x, %y %1 = zext <64 x i1> %mask to <64 x i8> @@ -2940,10 +2936,9 @@ ; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 -; KNL-NEXT: vpsrlw $15, %ymm2, %ymm2 ; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32xi1_to_32xi16: @@ -2958,10 +2953,9 @@ ; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 -; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm2, %ymm2 ; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQNOBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <32 x i16> %x, %y %1 = zext <32 x i1> %mask to <32 x i16> diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -619,23 +619,14 @@ } define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) { -; KNL-LABEL: insert_v32i16: -; KNL: ## %bb.0: -; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; KNL-NEXT: vpinsrw $1, %edi, %xmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; KNL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; KNL-NEXT: retq -; -; SKX-LABEL: insert_v32i16: -; SKX: ## %bb.0: -; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; SKX-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: insert_v32i16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 +; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <32 x i16> %x, i16 %val, i32 1 %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9 @@ -669,25 +660,14 @@ } define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) { -; KNL-LABEL: insert_v64i8: -; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm2 -; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; KNL-NEXT: vpinsrb $2, %edi, %xmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: insert_v64i8: -; SKX: ## %bb.0: -; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 -; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; SKX-NEXT: vpinsrb $2, %edi, 
%xmm0, %xmm0 -; SKX-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: insert_v64i8: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 +; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 +; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; CHECK-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <64 x i8> %x, i8 %val, i32 1 %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50 @@ -1019,9 +999,9 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: test_extractelement_v64i1: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 @@ -1055,9 +1035,9 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: extractelement_v64i1_alt: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 @@ -1350,42 +1330,23 @@ } define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) { -; KNL-LABEL: test_extractelement_variable_v32i16: -; KNL: ## %bb.0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: .cfi_offset %rbp, -16 -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-64, %rsp -; KNL-NEXT: subq $128, %rsp -; KNL-NEXT: ## kill: def $edi killed $edi def $rdi -; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovaps %ymm0, (%rsp) -; KNL-NEXT: andl $31, %edi -; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: test_extractelement_variable_v32i16: -; SKX: ## %bb.0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: .cfi_offset %rbp, -16 -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp -; SKX-NEXT: ## kill: def $edi killed $edi def $rdi -; SKX-NEXT: vmovaps %zmm0, (%rsp) -; SKX-NEXT: andl $31, %edi -; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: test_extractelement_variable_v32i16: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi +; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: andl $31, %edi +; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %t2 = extractelement <32 x i16> %t1, i32 %index ret i16 %t2 } @@ -1426,86 +1387,47 @@ } define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) { -; KNL-LABEL: test_extractelement_variable_v64i8: -; KNL: ## %bb.0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: .cfi_offset %rbp, 
-16 -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-64, %rsp -; KNL-NEXT: subq $128, %rsp -; KNL-NEXT: ## kill: def $edi killed $edi def $rdi -; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovaps %ymm0, (%rsp) -; KNL-NEXT: andl $63, %edi -; KNL-NEXT: movb (%rsp,%rdi), %al -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: test_extractelement_variable_v64i8: -; SKX: ## %bb.0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: .cfi_offset %rbp, -16 -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp -; SKX-NEXT: ## kill: def $edi killed $edi def $rdi -; SKX-NEXT: vmovaps %zmm0, (%rsp) -; SKX-NEXT: andl $63, %edi -; SKX-NEXT: movb (%rsp,%rdi), %al -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: test_extractelement_variable_v64i8: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi +; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: andl $63, %edi +; CHECK-NEXT: movb (%rsp,%rdi), %al +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %t2 = extractelement <64 x i8> %t1, i32 %index ret i8 %t2 } define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) { -; KNL-LABEL: test_extractelement_variable_v64i8_indexi8: -; KNL: ## %bb.0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: .cfi_offset %rbp, -16 -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-64, %rsp -; KNL-NEXT: subq $128, %rsp -; KNL-NEXT: addb %dil, %dil -; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovaps %ymm0, (%rsp) -; KNL-NEXT: movzbl %dil, %eax -; KNL-NEXT: andl $63, %eax -; KNL-NEXT: movb (%rsp,%rax), %al -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: test_extractelement_variable_v64i8_indexi8: -; SKX: ## %bb.0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: .cfi_offset %rbp, -16 -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp -; SKX-NEXT: addb %dil, %dil -; SKX-NEXT: vmovaps %zmm0, (%rsp) -; SKX-NEXT: movzbl %dil, %eax -; SKX-NEXT: andl $63, %eax -; SKX-NEXT: movb (%rsp,%rax), %al -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; CHECK-LABEL: test_extractelement_variable_v64i8_indexi8: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $128, %rsp +; CHECK-NEXT: addb %dil, %dil +; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: andl $63, %eax +; CHECK-NEXT: movb (%rsp,%rax), %al +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %i = add i8 %index, %index %t2 = extractelement <64 x i8> %t1, i8 %i @@ -1772,16 +1694,16 @@ ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: 
subq $128, %rsp ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm2 +; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; KNL-NEXT: andl $63, %esi ; KNL-NEXT: testb %dil, %dil -; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm0, (%rsp) +; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm2, (%rsp) ; KNL-NEXT: setne (%rsp,%rsi) ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 @@ -1850,118 +1772,118 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-128, %rsp ; KNL-NEXT: subq $256, %rsp ## imm = 0x100 -; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0 +; KNL-NEXT: movl 744(%rbp), %eax +; KNL-NEXT: andl $127, %eax +; KNL-NEXT: vmovd %edi, %xmm0 +; KNL-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, %r8d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm0, %xmm0 ; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm1, %xmm1 +; 
KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; KNL-NEXT: vmovd %edi, %xmm2 -; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm2, %xmm2 +; KNL-NEXT: 
vpinsrb $9, 296(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm2, %xmm2 ; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm3, %xmm3 ; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; KNL-NEXT: movl 744(%rbp), %eax -; KNL-NEXT: andl $127, %eax -; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm2 ; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm4, %xmm4 +; 
KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm4, %xmm4 +; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1 ; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: cmpb $0, 736(%rbp) -; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm2, (%rsp) +; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm0, (%rsp) ; KNL-NEXT: setne (%rsp,%rax) ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2159,21 +2081,23 @@ ; KNL-NEXT: andq $-128, %rsp ; KNL-NEXT: subq $256, %rsp ## imm = 0x100 ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi -; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 +; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm4 +; KNL-NEXT: vpternlogq $15, %zmm4, %zmm4, %zmm4 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 -; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3 -; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 ; KNL-NEXT: andl $127, %esi ; KNL-NEXT: testb %dil, %dil -; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm0, (%rsp) +; KNL-NEXT: vmovdqa %ymm4, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm3, (%rsp) ; KNL-NEXT: setne (%rsp,%rsi) ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll --- a/llvm/test/CodeGen/X86/avx512-logic.ll +++ b/llvm/test/CodeGen/X86/avx512-logic.ll @@ -166,7 +166,7 @@ define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: and_v64i8: ; KNL: ## %bb.0: -; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: and_v64i8: @@ -180,11 +180,7 @@ define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: andn_v64i8: ; KNL: ## %bb.0: -; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; KNL-NEXT: vandnps %ymm2, %ymm3, %ymm2 -; KNL-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: andn_v64i8: @@ -202,7 +198,7 @@ define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: or_v64i8: ; KNL: ## %bb.0: -; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: or_v64i8: @@ -216,7 +212,7 @@ define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: xor_v64i8: ; KNL: ## %bb.0: -; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: xor_v64i8: @@ -230,7 +226,7 
@@ define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: and_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: and_v32i16: @@ -244,11 +240,7 @@ define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: andn_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; KNL-NEXT: vandnps %ymm2, %ymm3, %ymm2 -; KNL-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: andn_v32i16: @@ -264,7 +256,7 @@ define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: or_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: or_v32i16: @@ -278,7 +270,7 @@ define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: xor_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: xor_v32i16: diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1402,17 +1402,15 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1 ; KNL-NEXT: vpsllw $15, %ymm2, %ymm2 ; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 -; KNL-NEXT: vpand %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test21: @@ -1431,17 +1429,15 @@ ; ; AVX512DQ-LABEL: test21: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512DQ-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsllw $15, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test21: @@ -1876,10 +1872,10 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { ; KNL-LABEL: test_build_vec_v32i1: ; KNL: ## %bb.0: -; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test_build_vec_v32i1: @@ -1894,10 +1890,10 @@ ; ; AVX512DQ-LABEL: test_build_vec_v32i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v32i1: @@ -1911,10 +1907,10 @@ define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize { ; KNL-LABEL: test_build_vec_v32i1_optsize: ; KNL: ## %bb.0: -; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test_build_vec_v32i1_optsize: @@ -1933,10 +1929,10 @@ ; ; AVX512DQ-LABEL: test_build_vec_v32i1_optsize: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v32i1_optsize: @@ -1952,10 +1948,10 @@ define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 { ; KNL-LABEL: test_build_vec_v32i1_pgso: ; KNL: ## %bb.0: -; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test_build_vec_v32i1_pgso: @@ -1974,10 +1970,10 @@ ; ; AVX512DQ-LABEL: test_build_vec_v32i1_pgso: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 
+; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v32i1_pgso: @@ -1993,10 +1989,10 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; KNL-LABEL: test_build_vec_v64i1: ; KNL: ## %bb.0: -; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test_build_vec_v64i1: @@ -2011,10 +2007,10 @@ ; ; AVX512DQ-LABEL: test_build_vec_v64i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v64i1: @@ -2450,11 +2446,11 @@ ; KNL: ## %bb.0: ; KNL-NEXT: kmovw (%rdi), %k1 ; KNL-NEXT: kmovw 2(%rdi), %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdw %zmm1, %ymm1 -; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: load_32i1: @@ -2473,11 +2469,11 @@ ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: load_32i1: @@ -2753,12 +2749,12 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) { ; KNL-LABEL: store_32i1_1: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL-NEXT: kmovw %k1, 2(%rdi) ; KNL-NEXT: kmovw %k0, (%rdi) @@ -2783,12 +2779,12 @@ ; ; AVX512DQ-LABEL: store_32i1_1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 -; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: kmovw %k1, 2(%rdi) ; 
AVX512DQ-NEXT: kmovw %k0, (%rdi) @@ -4855,29 +4851,31 @@ ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm6 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm7 -; KNL-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; KNL-NEXT: vpcmpeqw %ymm0, %ymm8, %ymm0 -; KNL-NEXT: vpcmpeqw %ymm7, %ymm8, %ymm7 -; KNL-NEXT: vpcmpeqw %ymm1, %ymm8, %ymm1 -; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpcmpeqw %ymm6, %ymm8, %ymm1 -; KNL-NEXT: vpor %ymm1, %ymm7, %ymm1 -; KNL-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2 -; KNL-NEXT: vpcmpeqw %ymm5, %ymm8, %ymm5 -; KNL-NEXT: vpcmpeqw %ymm3, %ymm8, %ymm3 -; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2 -; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm8, %ymm2 -; KNL-NEXT: vpor %ymm2, %ymm5, %ymm2 -; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; KNL-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 +; KNL-NEXT: vpcmpeqw %ymm5, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; KNL-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 +; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1 +; KNL-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; KNL-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; KNL-NEXT: vporq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpmovsxwd %ymm1, %zmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx @@ -4941,29 +4939,31 @@ ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm6 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm7 -; AVX512DQ-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX512DQ-NEXT: vpcmpeqw %ymm0, %ymm8, %ymm0 -; AVX512DQ-NEXT: vpcmpeqw %ymm7, %ymm8, %ymm7 -; AVX512DQ-NEXT: vpcmpeqw %ymm1, %ymm8, %ymm1 -; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpcmpeqw %ymm6, %ymm8, %ymm1 -; AVX512DQ-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX512DQ-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2 -; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm8, %ymm5 -; AVX512DQ-NEXT: vpcmpeqw %ymm3, %ymm8, %ymm3 -; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpcmpeqw %ymm4, %ymm8, %ymm2 -; AVX512DQ-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512DQ-NEXT: 
vpcmpeqw %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax -; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx @@ -5024,57 +5024,31 @@ ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm9 -; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm10 -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm11 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm7 -; KNL-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; KNL-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm13 -; KNL-NEXT: vextracti128 $1, %ymm13, %xmm4 -; KNL-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7 -; KNL-NEXT: vextracti128 $1, %ymm7, %xmm5 -; KNL-NEXT: vpcmpeqb %ymm1, %ymm8, %ymm1 -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm6 -; KNL-NEXT: vpor %xmm6, %xmm4, %xmm12 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm11, %ymm6 -; KNL-NEXT: vextracti128 $1, %ymm6, %xmm4 -; KNL-NEXT: vpor %xmm4, %xmm5, %xmm11 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm8, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm5 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm10, %ymm10 -; KNL-NEXT: vextracti128 $1, %ymm10, %xmm4 -; KNL-NEXT: vpcmpeqb %ymm3, %ymm8, %ymm3 -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0 -; KNL-NEXT: vpor %xmm0, %xmm5, %xmm0 -; KNL-NEXT: vpand %xmm0, %xmm12, %xmm12 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm9, %ymm5 -; KNL-NEXT: vextracti128 $1, %ymm5, %xmm0 -; KNL-NEXT: vpor %xmm0, %xmm4, %xmm0 -; KNL-NEXT: vpand %xmm0, %xmm11, %xmm0 -; KNL-NEXT: vpor %xmm6, %xmm7, %xmm4 -; KNL-NEXT: vpor %xmm1, %xmm13, %xmm1 -; KNL-NEXT: vpor %xmm5, %xmm10, %xmm5 -; KNL-NEXT: vpand %xmm5, %xmm4, %xmm4 -; KNL-NEXT: vpor %xmm3, %xmm2, %xmm2 -; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpmovsxbd %xmm12, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: shll $16, %ecx -; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: vpmovsxbd %xmm4, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %edx -; KNL-NEXT: shll $16, %edx -; KNL-NEXT: orl %eax, %edx -; KNL-NEXT: shlq $32, %rdx -; KNL-NEXT: orq %rcx, %rdx +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm4, %ymm4 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm4, %ymm4 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; KNL-NEXT: 
vpcmpeqb %ymm5, %ymm1, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm3, %ymm3 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; KNL-NEXT: vporq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %ecx +; KNL-NEXT: shlq $32, %rcx +; KNL-NEXT: orq %rax, %rcx ; KNL-NEXT: je LBB78_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax @@ -5134,57 +5108,31 @@ ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm9 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm10 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm11 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm7 -; AVX512DQ-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX512DQ-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm13 -; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX512DQ-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7 -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm5 -; AVX512DQ-NEXT: vpcmpeqb %ymm1, %ymm8, %ymm1 -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512DQ-NEXT: vpor %xmm6, %xmm4, %xmm12 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm11, %ymm6 -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm4 -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm11 -; AVX512DQ-NEXT: vpcmpeqb %ymm2, %ymm8, %ymm2 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm10, %ymm10 -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX512DQ-NEXT: vpcmpeqb %ymm3, %ymm8, %ymm3 -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX512DQ-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512DQ-NEXT: vpand %xmm0, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm9, %ymm5 -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512DQ-NEXT: vpand %xmm0, %xmm11, %xmm0 -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm4 -; AVX512DQ-NEXT: vpor %xmm1, %xmm13, %xmm1 -; AVX512DQ-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512DQ-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: kmovw %k0, %eax -; AVX512DQ-NEXT: vpmovsxbd %xmm12, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: kmovw %k0, %ecx -; AVX512DQ-NEXT: shll $16, %ecx -; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: kmovw %k0, %eax -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 -; AVX512DQ-NEXT: kmovw %k0, %edx -; AVX512DQ-NEXT: shll $16, %edx -; AVX512DQ-NEXT: orl %eax, %edx -; AVX512DQ-NEXT: shlq $32, %rdx -; AVX512DQ-NEXT: orq %rcx, %rdx +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2 +; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovmskb %ymm0, %eax +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovmskb %ymm0, %ecx +; AVX512DQ-NEXT: shlq $32, %rcx +; AVX512DQ-NEXT: orq %rax, %rcx ; AVX512DQ-NEXT: je LBB78_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -434,101 +434,45 @@ } define <32 x i16> @pr42355_v32i16(i1 %c, <32 x i16> %x, <32 x i16> %y) { -; X86-AVX512F-LABEL: pr42355_v32i16: -; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-AVX512F-NEXT: jne .LBB14_1 -; X86-AVX512F-NEXT: # %bb.2: -; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; X86-AVX512F-NEXT: retl -; X86-AVX512F-NEXT: .LBB14_1: -; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; X86-AVX512F-NEXT: retl -; -; X64-AVX512F-LABEL: pr42355_v32i16: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: testb $1, %dil -; X64-AVX512F-NEXT: jne .LBB14_1 -; X64-AVX512F-NEXT: # %bb.2: -; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0 -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; X64-AVX512F-NEXT: .LBB14_1: -; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; -; X86-AVX512BW-LABEL: pr42355_v32i16: -; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-AVX512BW-NEXT: jne .LBB14_2 -; X86-AVX512BW-NEXT: # %bb.1: -; X86-AVX512BW-NEXT: vmovaps %zmm1, %zmm0 -; X86-AVX512BW-NEXT: .LBB14_2: -; X86-AVX512BW-NEXT: retl +; X86-LABEL: pr42355_v32i16: +; X86: # %bb.0: +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB14_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: vmovaps %zmm1, %zmm0 +; X86-NEXT: .LBB14_2: +; X86-NEXT: retl ; -; X64-AVX512BW-LABEL: pr42355_v32i16: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: testb $1, %dil -; X64-AVX512BW-NEXT: jne .LBB14_2 -; X64-AVX512BW-NEXT: # %bb.1: -; X64-AVX512BW-NEXT: vmovaps %zmm1, %zmm0 -; X64-AVX512BW-NEXT: .LBB14_2: -; X64-AVX512BW-NEXT: retq +; X64-LABEL: pr42355_v32i16: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: jne .LBB14_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: vmovaps %zmm1, %zmm0 +; X64-NEXT: .LBB14_2: +; X64-NEXT: retq %a = select i1 %c, <32 x i16> %x, <32 x i16> %y ret <32 x i16> %a } define <64 x i8> @pr42355_v64i8(i1 %c, <64 x i8> %x, <64 x i8> %y) { -; X86-AVX512F-LABEL: pr42355_v64i8: -; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-AVX512F-NEXT: jne .LBB15_1 -; X86-AVX512F-NEXT: # %bb.2: -; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; X86-AVX512F-NEXT: retl -; X86-AVX512F-NEXT: .LBB15_1: -; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; X86-AVX512F-NEXT: 
retl -; -; X64-AVX512F-LABEL: pr42355_v64i8: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: testb $1, %dil -; X64-AVX512F-NEXT: jne .LBB15_1 -; X64-AVX512F-NEXT: # %bb.2: -; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0 -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; X64-AVX512F-NEXT: .LBB15_1: -; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; X64-AVX512F-NEXT: retq -; -; X86-AVX512BW-LABEL: pr42355_v64i8: -; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-AVX512BW-NEXT: jne .LBB15_2 -; X86-AVX512BW-NEXT: # %bb.1: -; X86-AVX512BW-NEXT: vmovaps %zmm1, %zmm0 -; X86-AVX512BW-NEXT: .LBB15_2: -; X86-AVX512BW-NEXT: retl +; X86-LABEL: pr42355_v64i8: +; X86: # %bb.0: +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB15_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: vmovaps %zmm1, %zmm0 +; X86-NEXT: .LBB15_2: +; X86-NEXT: retl ; -; X64-AVX512BW-LABEL: pr42355_v64i8: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: testb $1, %dil -; X64-AVX512BW-NEXT: jne .LBB15_2 -; X64-AVX512BW-NEXT: # %bb.1: -; X64-AVX512BW-NEXT: vmovaps %zmm1, %zmm0 -; X64-AVX512BW-NEXT: .LBB15_2: -; X64-AVX512BW-NEXT: retq +; X64-LABEL: pr42355_v64i8: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: vmovaps %zmm1, %zmm0 +; X64-NEXT: .LBB15_2: +; X64-NEXT: retq %a = select i1 %c, <64 x i8> %x, <64 x i8> %y ret <64 x i8> %a } diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -454,12 +454,12 @@ define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 { ; KNL-LABEL: trunc_wb_512: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_wb_512: diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -134,10 +134,11 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512VL-NEXT: 
vextracti64x4 $1, %zmm0, %ymm0 ; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_8i16_32i16: @@ -148,10 +149,11 @@ ; ; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512DQVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -162,10 +164,11 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_16i8_64i8: @@ -176,10 +179,11 @@ ; ; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512DQVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll @@ -54,10 +54,11 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_16i16_32i16: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_16i16_32i16: @@ -68,10 +69,11 @@ ; ; X64-AVX512DQVL-LABEL: test_broadcast_16i16_32i16: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX512DQVL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; X64-AVX512DQVL-NEXT: vpaddw
{{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <16 x i16>, <16 x i16> *%p %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -82,10 +84,11 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_32i8_64i8: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_32i8_64i8: @@ -96,10 +99,11 @@ ; ; X64-AVX512DQVL-LABEL: test_broadcast_32i8_64i8: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX512DQVL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <32 x i8>, <32 x i8> *%p %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -245,21 +245,25 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind { ; KNL-LABEL: test12_v64i16: ; KNL: ## %bb.0: -; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x75,0xc4] -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0] -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0] +; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm4 ## encoding: [0xc5,0xfd,0x75,0xe2] +; KNL-NEXT: vpmovsxwd %ymm4, %zmm4 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xe4] +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 ## encoding: [0x62,0xf2,0x5d,0x48,0x27,0xc4] ; KNL-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0x75,0xc5] +; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xd2,0x01] +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc0,0x01] +; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x75,0xc2] ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0] ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0] ; KNL-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; KNL-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] ; KNL-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] -; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0 ## encoding: [0xc5,0xed,0x75,0xc6] +; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0x75,0xc3] ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0] ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0] ; KNL-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0 ## encoding:
[0xc5,0xe5,0x75,0xc7] +; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xd8,0x01] +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc9,0x01] +; KNL-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0x75,0xc0] ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0] ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0] ; KNL-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] diff --git a/llvm/test/CodeGen/X86/avx512-vselect.ll b/llvm/test/CodeGen/X86/avx512-vselect.ll --- a/llvm/test/CodeGen/X86/avx512-vselect.ll +++ b/llvm/test/CodeGen/X86/avx512-vselect.ll @@ -173,31 +173,20 @@ ; ; CHECK-KNL-LABEL: test8: ; CHECK-KNL: # %bb.0: -; CHECK-KNL-NEXT: pushq %rbp -; CHECK-KNL-NEXT: .cfi_def_cfa_offset 16 -; CHECK-KNL-NEXT: .cfi_offset %rbp, -16 -; CHECK-KNL-NEXT: movq %rsp, %rbp -; CHECK-KNL-NEXT: .cfi_def_cfa_register %rbp -; CHECK-KNL-NEXT: andq $-32, %rsp -; CHECK-KNL-NEXT: subq $32, %rsp -; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm8 -; CHECK-KNL-NEXT: vmovdqa 16(%rbp), %ymm9 -; CHECK-KNL-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; CHECK-KNL-NEXT: vpcmpeqb %ymm0, %ymm10, %ymm11 -; CHECK-KNL-NEXT: vpmovsxbw %xmm11, %ymm0 -; CHECK-KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm5, %ymm0 -; CHECK-KNL-NEXT: vextracti128 $1, %ymm11, %xmm1 -; CHECK-KNL-NEXT: vpmovsxbw %xmm1, %ymm1 -; CHECK-KNL-NEXT: vpblendvb %ymm1, %ymm2, %ymm6, %ymm1 -; CHECK-KNL-NEXT: vpcmpeqb %ymm10, %ymm8, %ymm5 -; CHECK-KNL-NEXT: vpmovsxbw %xmm5, %ymm2 -; CHECK-KNL-NEXT: vpblendvb %ymm2, %ymm3, %ymm7, %ymm2 +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-KNL-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; CHECK-KNL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5 +; CHECK-KNL-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vpmovsxbw %xmm0, %ymm6 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-KNL-NEXT: vpmovsxbw %xmm0, %ymm0 +; CHECK-KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; CHECK-KNL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0 +; CHECK-KNL-NEXT: vpmovsxbw %xmm5, %ymm1 ; CHECK-KNL-NEXT: vextracti128 $1, %ymm5, %xmm3 ; CHECK-KNL-NEXT: vpmovsxbw %xmm3, %ymm3 -; CHECK-KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm9, %ymm3 -; CHECK-KNL-NEXT: movq %rbp, %rsp -; CHECK-KNL-NEXT: popq %rbp -; CHECK-KNL-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; CHECK-KNL-NEXT: vpternlogq $202, %zmm4, %zmm2, %zmm1 ; CHECK-KNL-NEXT: retq %c = icmp eq <64 x i8> %x, zeroinitializer %ret = select <64 x i1> %c, <64 x i16> %a, <64 x i16> %b diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -918,15 +918,15 @@ ; ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, %ecx +; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; 
NoVLX-NEXT: shll $16, %eax ; NoVLX-NEXT: orl %ecx, %eax @@ -951,12 +951,12 @@ ; ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 -; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm0 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -991,8 +991,8 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -5722,15 +5722,15 @@ ; ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, %ecx +; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax ; NoVLX-NEXT: orl %ecx, %eax @@ -5755,12 +5755,12 @@ ; ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 -; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm0 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -5795,8 +5795,8 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -10574,17 +10574,17 @@ ; ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 +; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, %ecx +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: 
vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax ; NoVLX-NEXT: orl %ecx, %eax @@ -10609,18 +10609,18 @@ ; ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; NoVLX-NEXT: vmovdqa (%rdi), %ymm2 -; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm3 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm1 +; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, %ecx +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax ; NoVLX-NEXT: orl %ecx, %eax @@ -10697,8 +10697,8 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm1 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -15454,19 +15454,19 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, %ecx +; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpmaxuw %ymm3, %ymm2, %ymm0 -; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax ; NoVLX-NEXT: orl %ecx, %eax @@ -15491,15 +15491,15 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm2 -; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpmaxuw 32(%rdi), %ymm1, %ymm0 -; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, 
%ymm0 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; NoVLX-NEXT: vpmaxuw 32(%rdi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -15537,8 +15537,8 @@ ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi -; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NoVLX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -266,20 +266,20 @@ ; ; AVX512F-LABEL: v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6 -; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm1 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %ecx -; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm0 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: shll $16, %eax @@ -558,38 +558,18 @@ ; ; AVX512F-LABEL: v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6 -; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512F-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %ecx -; AVX512F-NEXT: shll $16, %ecx -; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpand %xmm4, %xmm5, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: 
kmovw %k0, %edx -; AVX512F-NEXT: vpmovsxbd %xmm6, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: shll $16, %eax -; AVX512F-NEXT: orl %edx, %eax +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovmskb %ymm0, %ecx +; AVX512F-NEXT: vpmovmskb %ymm1, %eax ; AVX512F-NEXT: shlq $32, %rax ; AVX512F-NEXT: orq %rcx, %rax ; AVX512F-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -777,13 +777,13 @@ ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VLBW-LABEL: ext_i32_32i16: diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll @@ -51,15 +51,15 @@ ; ; AVX512F-LABEL: v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %ecx -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: shll $16, %eax ; AVX512F-NEXT: orl %ecx, %eax @@ -450,15 +450,15 @@ ; ; AVX512F-LABEL: bitcast_64i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm3 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm3 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -520,12 +520,12 @@ ; ; AVX512F-LABEL: bitcast_32i16_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 -; 
AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kmovw %k1, 2(%rdi) diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll --- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -889,29 +889,11 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_nt64xi8: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) -; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512F-LABEL: test_nt64xi8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: vmovntdq %ymm0, (%rdi) -; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_nt64xi8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_nt64xi8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovntdq %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: store <64 x i8> %X, <64 x i8>* %ptr, align 64, !nontemporal !1 ret void @@ -933,29 +915,11 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512VL-LABEL: test_nt32xi16: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) -; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512F-LABEL: test_nt32xi16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: vmovntdq %ymm0, (%rdi) -; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_nt32xi16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_nt32xi16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovntdq %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: store <32 x i16> %X, <32 x i16>* %ptr, align 64, !nontemporal !1 ret void diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll --- a/llvm/test/CodeGen/X86/kshift.ll +++ b/llvm/test/CodeGen/X86/kshift.ll @@ -61,23 +61,23 @@ define i32 @kshiftl_v32i1_1(<32 x i16> %x, <32 x i16> %y) { ; KNL-LABEL: kshiftl_v32i1_1: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm3 -; KNL-NEXT: vpmovsxwd %ymm3, %zmm3 -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k1 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpeqw %ymm3, %ymm2, 
%ymm2 +; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; KNL-NEXT: kshiftlw $1, %k2, %k1 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1 +; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %ecx @@ -107,16 +107,15 @@ define i64 @kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftl_v64i1_1: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm5 -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm0 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 +; KNL-NEXT: vpmovsxbd %xmm3, %zmm4 +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k1 +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3 +; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3 ; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 ; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3 @@ -125,25 +124,26 @@ ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} ; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} ; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm0[15],zmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} -; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm6[15],zmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm4[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; KNL-NEXT: valignd {{.*#+}} zmm4 = zmm5[15],zmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; KNL-NEXT: kshiftlw $1, %k1, %k3 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm5, %ymm5 +; KNL-NEXT: vextracti128 $1, %ymm5, %xmm6 ; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 ; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 -; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 +; KNL-NEXT: vpmovsxbd %xmm5, %zmm5 +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 
{%k3} ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k4} +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 {%k4} ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx @@ -233,13 +233,13 @@ define i32 @kshiftl_v32i1_31(<32 x i16> %x, <32 x i16> %y) { ; KNL-LABEL: kshiftl_v32i1_31: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax @@ -266,13 +266,13 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftl_v64i1_63: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} @@ -358,23 +358,23 @@ define i32 @kshiftr_v32i1_1(<32 x i16> %x, <32 x i16> %y) { ; KNL-LABEL: kshiftr_v32i1_1: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm3 -; KNL-NEXT: vpmovsxwd %ymm3, %zmm3 -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k1 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0] +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm2[0] ; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1 +; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm2 +; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %ecx @@ -404,16 +404,15 @@ define i64 @kshiftr_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftr_v64i1_1: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3 -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm5 -; KNL-NEXT: vpmovsxbd %xmm5, %zmm5 -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm5 -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2 +; KNL-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm3 +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm4 +; KNL-NEXT: vpmovsxbd %xmm4, %zmm4 +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k1 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm4 +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k2 ; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 ; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -421,19 +420,20 @@ ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} ; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} -; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0] -; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm6[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] -; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} -; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0] +; KNL-NEXT: valignd {{.*#+}} zmm4 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0] +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} +; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm5[0] ; KNL-NEXT: kshiftrw $1, %k1, %k3 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm6 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm5 +; KNL-NEXT: vextracti128 $1, %ymm5, %xmm6 ; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 ; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm1 +; KNL-NEXT: vpmovsxbd %xmm5, %zmm5 +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -447,7 +447,7 @@ ; KNL-NEXT: shlq $32, %rcx ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2} ; KNL-NEXT: kmovw %k0, %edx -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k1} +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: shll $16, %eax ; KNL-NEXT: orl %edx, %eax diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2253,12 +2253,16 @@ ; ; AVX512F-LABEL: jumbled_indices32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaddwd %ymm7, %ymm3, %ymm1 -; AVX512F-NEXT: vpmaddwd %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpmaddwd %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: jumbled_indices32: diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ 
b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -5373,14 +5373,14 @@ ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpmovmskb %ymm1, %eax ; AVX512F-NEXT: notl %eax ; AVX512F-NEXT: testb $1, %al diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -6140,9 +6140,9 @@ ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -460,22 +460,15 @@ } define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp { -; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovaps %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: retq +; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: +; ALL: # %bb.0: +; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2 @@ -515,22 +508,15 @@ } define <32 x i16> 
@merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp { -; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vmovaps %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-NEXT: retq +; ALL-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu: +; ALL: # %bb.0: +; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3 @@ -547,22 +533,15 @@ } define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp { -; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovaps %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: retq +; ALL-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: +; ALL: # %bb.0: +; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2 @@ -585,22 +564,15 @@ } define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp { -; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vmovaps %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-NEXT: retq +; ALL-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: +; ALL: # %bb.0: +; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll --- 
a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -318,10 +318,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm6 -; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 ; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm7 ; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 @@ -329,9 +326,12 @@ ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -342,10 +342,7 @@ ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 @@ -353,9 +350,12 @@ ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -393,20 +393,20 @@ ; AVX512F-NEXT: vpminuw %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 ; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm7 -; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8 -; AVX512F-NEXT: vpternlogq $15, %zmm8, %zmm8, %zmm8 -; AVX512F-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm6 +; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 +; AVX512F-NEXT: vpternlogq $15, %zmm7, %zmm7, %zmm7 ; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: 
vpmullw %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm6 ; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -419,20 +419,20 @@ ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm7, %ymm7, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm4, %ymm7, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -471,10 +471,7 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm6 -; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 ; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 ; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm7 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 @@ -482,9 +479,12 @@ ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpsubw %ymm7, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -496,10 +496,7 @@ ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; 
AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 @@ -507,9 +504,12 @@ ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -544,52 +544,52 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, <32 x i16>* %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i16_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm6 -; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm7 -; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm7 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; 
AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_reg_mem: @@ -621,53 +621,53 @@ define <32 x i16> @vec512_i16_signed_mem_mem(<32 x i16>* %a1_addr, <32 x i16>* %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i16_signed_mem_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm6 -; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm7 -; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm7 +; AVX512F-NEXT: 
vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpsubw %ymm7, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -711,10 +711,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm6 -; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: 
vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm7 ; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 @@ -727,23 +724,26 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm8, %ymm1 ; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -755,10 +755,7 @@ ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 @@ -771,23 +768,26 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm8, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -831,45 +831,45 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpminub %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 -; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm7 -; AVX512F-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8 -; AVX512F-NEXT: vpternlogq $15, %zmm8, %zmm8, %zmm8 -; AVX512F-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512F-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpminub %ymm4, %ymm2, 
%ymm5 +; AVX512F-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm3 +; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6 +; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 +; AVX512F-NEXT: vpternlogq $15, %zmm7, %zmm7, %zmm7 +; AVX512F-NEXT: vpmaxub %ymm4, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubb %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm7, %ymm7 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11],ymm7[12],ymm0[12],ymm7[13],ymm0[13],ymm7[14],ymm0[14],ymm7[15],ymm0[15],ymm7[24],ymm0[24],ymm7[25],ymm0[25],ymm7[26],ymm0[26],ymm7[27],ymm0[27],ymm7[28],ymm0[28],ymm7[29],ymm0[29],ymm7[30],ymm0[30],ymm7[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[16],ymm0[16],ymm7[17],ymm0[17],ymm7[18],ymm0[18],ymm7[19],ymm0[19],ymm7[20],ymm0[20],ymm7[21],ymm0[21],ymm7[22],ymm0[22],ymm7[23],ymm0[23] +; AVX512F-NEXT: vpmullw %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm8, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] +; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, 
%ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -882,15 +882,12 @@ ; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm7, %ymm7, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] @@ -898,23 +895,26 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm7, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11],ymm7[12],ymm0[12],ymm7[13],ymm0[13],ymm7[14],ymm0[14],ymm7[15],ymm0[15],ymm7[24],ymm0[24],ymm7[25],ymm0[25],ymm7[26],ymm0[26],ymm7[27],ymm0[27],ymm7[28],ymm0[28],ymm7[29],ymm0[29],ymm7[30],ymm0[30],ymm7[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm8, %ymm4 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm8, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; 
AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[16],ymm0[16],ymm7[17],ymm0[17],ymm7[18],ymm0[18],ymm7[19],ymm0[19],ymm7[20],ymm0[20],ymm7[21],ymm0[21],ymm7[22],ymm0[22],ymm7[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm8, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm8, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -964,10 +964,7 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm6 -; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 ; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 ; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm7 ; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 @@ -980,23 +977,26 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm7, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm0, %ymm8, %ymm0 ; AVX512F-NEXT: vpackuswb %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; 
AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 @@ -1009,10 +1009,7 @@ ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 @@ -1025,23 +1022,26 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm0, %ymm8, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 @@ -1087,14 +1087,11 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, <64 x i8>* %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6 ; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7 ; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 @@ -1107,23 +1104,26 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = 
ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 ; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 @@ -1132,14 +1132,11 @@ ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; 
AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 @@ -1152,23 +1149,26 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 @@ -1214,15 +1214,12 @@ define <64 x i8> @vec512_i8_signed_mem_mem(<64 x i8>* %a1_addr, <64 x i8>* %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6 ; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7 ; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 @@ -1235,23 +1232,26 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = 
ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 ; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 @@ -1260,15 +1260,12 @@ ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 @@ -1281,23 +1278,26 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -543,13 +543,13 @@ ; ; KNL-LABEL: allones_v32i16_sign: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 +; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %ecx @@ -611,13 +611,13 @@ ; ; KNL-LABEL: allzeros_v32i16_sign: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 +; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %ecx @@ -1381,12 +1381,12 @@ ; ; KNL-LABEL: allones_v64i8_and1: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpsllw $7, %ymm0, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 -; KNL-NEXT: vpsllw $7, %ymm1, %ymm1 -; KNL-NEXT: vpmovmskb %ymm1, %eax +; KNL-NEXT: vpmovmskb %ymm0, %eax ; KNL-NEXT: shlq $32, %rax -; KNL-NEXT: vpmovmskb %ymm0, %ecx +; KNL-NEXT: vpmovmskb %ymm1, %ecx ; KNL-NEXT: orq %rax, %rcx ; KNL-NEXT: cmpq $-1, %rcx ; KNL-NEXT: sete %al @@ -1463,12 +1463,12 @@ ; ; KNL-LABEL: allzeros_v64i8_and1: ; KNL: # 
%bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpsllw $7, %ymm0, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 -; KNL-NEXT: vpsllw $7, %ymm1, %ymm1 -; KNL-NEXT: vpmovmskb %ymm1, %eax +; KNL-NEXT: vpmovmskb %ymm0, %eax ; KNL-NEXT: shlq $32, %rax -; KNL-NEXT: vpmovmskb %ymm0, %ecx +; KNL-NEXT: vpmovmskb %ymm1, %ecx ; KNL-NEXT: orq %rax, %rcx ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper @@ -1686,13 +1686,13 @@ ; ; KNL-LABEL: allones_v32i16_and1: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpsllw $15, %ymm0, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpsllw $15, %ymm1, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1766,13 +1766,13 @@ ; ; KNL-LABEL: allzeros_v32i16_and1: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpsllw $15, %ymm0, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpsllw $15, %ymm1, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2784,12 +2784,12 @@ ; ; KNL-LABEL: allones_v64i8_and4: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpsllw $5, %ymm0, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 -; KNL-NEXT: vpsllw $5, %ymm1, %ymm1 -; KNL-NEXT: vpmovmskb %ymm1, %eax +; KNL-NEXT: vpmovmskb %ymm0, %eax ; KNL-NEXT: shlq $32, %rax -; KNL-NEXT: vpmovmskb %ymm0, %ecx +; KNL-NEXT: vpmovmskb %ymm1, %ecx ; KNL-NEXT: orq %rax, %rcx ; KNL-NEXT: cmpq $-1, %rcx ; KNL-NEXT: sete %al @@ -2866,12 +2866,12 @@ ; ; KNL-LABEL: allzeros_v64i8_and4: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpsllw $5, %ymm0, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 -; KNL-NEXT: vpsllw $5, %ymm1, %ymm1 -; KNL-NEXT: vpmovmskb %ymm1, %eax +; KNL-NEXT: vpmovmskb %ymm0, %eax ; KNL-NEXT: shlq $32, %rax -; KNL-NEXT: vpmovmskb %ymm0, %ecx +; KNL-NEXT: vpmovmskb %ymm1, %ecx ; KNL-NEXT: orq %rax, %rcx ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper @@ -3089,13 +3089,13 @@ ; ; KNL-LABEL: allones_v32i16_and4: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpsllw $13, %ymm0, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpsllw $13, %ymm1, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -3169,13 +3169,13 @@ ; ; 
KNL-LABEL: allzeros_v32i16_and4: ; KNL: # %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpsllw $13, %ymm0, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpsllw $13, %ymm1, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 diff --git a/llvm/test/CodeGen/X86/nontemporal-loads-2.ll b/llvm/test/CodeGen/X86/nontemporal-loads-2.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads-2.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads-2.ll @@ -921,44 +921,24 @@ ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512DQ-LABEL: test_v32i16_align16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: pushq %rbp -; AVX512DQ-NEXT: movq %rsp, %rbp -; AVX512DQ-NEXT: andq $-32, %rsp -; AVX512DQ-NEXT: subq $96, %rsp -; AVX512DQ-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512DQ-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512DQ-NEXT: vmovntdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0 -; AVX512DQ-NEXT: vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0 -; AVX512DQ-NEXT: movq %rbp, %rsp -; AVX512DQ-NEXT: popq %rbp -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_v32i16_align16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vmovntdqa 48(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: vmovntdqa 32(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512BW-NEXT: vmovaps (%rsp), %zmm0 -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_v32i16_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 16, !nontemporal !1 ret <32 x i16> %1 } @@ -1020,44 +1000,24 @@ ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512DQ-LABEL: test_v64i8_align16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: pushq %rbp -; AVX512DQ-NEXT: movq %rsp, %rbp -; AVX512DQ-NEXT: andq $-32, %rsp -; AVX512DQ-NEXT: subq $96, %rsp -; AVX512DQ-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512DQ-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512DQ-NEXT: vmovntdqa 
48(%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0 -; AVX512DQ-NEXT: vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0 -; AVX512DQ-NEXT: movq %rbp, %rsp -; AVX512DQ-NEXT: popq %rbp -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_v64i8_align16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vmovntdqa 48(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: vmovntdqa 32(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512BW-NEXT: vmovaps (%rsp), %zmm0 -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_v64i8_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 16, !nontemporal !1 ret <64 x i8> %1 } @@ -1299,27 +1259,20 @@ ; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512DQ-LABEL: test_v32i16_align32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_v32i16_align32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vmovntdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512BW-NEXT: vmovaps (%rsp), %zmm0 -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_v32i16_align32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 32, !nontemporal !1 ret <32 x i16> %1 } @@ -1357,27 +1310,20 @@ ; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512DQ-LABEL: test_v64i8_align32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_v64i8_align32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, 
%rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vmovntdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512BW-NEXT: vmovaps (%rsp), %zmm0 -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_v64i8_align32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 32, !nontemporal !1 ret <64 x i8> %1 } diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -1275,10 +1275,10 @@ ; ; AVX512F-LABEL: test_arg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vmovntdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -1291,10 +1291,10 @@ ; ; AVX512VL-LABEL: test_arg_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm2 -; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm3 -; AVX512VL-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -1350,10 +1350,10 @@ ; ; AVX512F-LABEL: test_arg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vmovntdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -1366,10 +1366,10 @@ ; ; AVX512VL-LABEL: test_arg_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm2 -; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm3 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -156,10 +156,12 @@ ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm2 ; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: 
vpmaddubsw 32(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm1 ; AVX512F-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2 -; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: pmaddubsw_512: diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -975,18 +975,18 @@ ; ; AVX512F-LABEL: mul_v64i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] +; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 +; 
AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -219,10 +219,16 @@ ; ; AVX512F-LABEL: mulhuw_v64i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpmulhuw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mulhuw_v64i16: @@ -270,10 +276,16 @@ ; ; AVX512F-LABEL: mulhw_v64i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmulhw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhw %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpmulhw %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpmulhw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpmulhw %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mulhw_v64i16: diff --git a/llvm/test/CodeGen/X86/pr45443.ll b/llvm/test/CodeGen/X86/pr45443.ll --- a/llvm/test/CodeGen/X86/pr45443.ll +++ b/llvm/test/CodeGen/X86/pr45443.ll @@ -3,10 +3,33 @@ ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64 define <16 x float> @PR45443() { -; CHECK-LABEL: PR45443: -; CHECK: # %bb.0: # %bb -; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: PR45443: +; X86: # %bb.0: # %bb +; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080] +; X86-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0 +; X86-NEXT: vpcmpltud {{\.LCPI.*}}{1to16}, %zmm1, %k1 +; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; X86-NEXT: vpbroadcastd {{.*#+}} ymm3 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215] +; X86-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X86-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X86-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; X86-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} +; X86-NEXT: vbroadcastss {{\.LCPI.*}}, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: PR45443: +; 
X64: # %bb.0: # %bb +; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080] +; X64-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0 +; X64-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm1, %k1 +; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; X64-NEXT: vpbroadcastd {{.*#+}} ymm3 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215] +; X64-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X64-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X64-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; X64-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} +; X64-NEXT: vbroadcastss {{.*}}(%rip), %zmm0 {%k1} +; X64-NEXT: retq bb: %tmp = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> , <4 x i32> ) %tmp4 = tail call fast <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> , <16 x float> undef) diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -97,176 +97,116 @@ ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: movq %rsp, %rbp ; AVX512F-NEXT: andq $-64, %rsp -; AVX512F-NEXT: subq $2112, %rsp # imm = 0x840 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-NEXT: subq $128, %rsp +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 ; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, (%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; 
AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovaps %zmm0, (%rsp) ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: movzwl 1536(%rsp,%rax,2), %eax +; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrw $1, %xmm4, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $1, 1600(%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $2, %xmm4, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $2, 1664(%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $3, %xmm4, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $3, 1728(%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $4, %xmm4, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $4, 1792(%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $5, %xmm4, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $5, 1856(%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $6, %xmm4, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $6, 1920(%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrw $7, %xmm4, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $7, 1984(%rsp,%rax,2), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl 1024(%rsp,%rax,2), %eax +; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrw $1, %xmm2, %eax +; AVX512F-NEXT: vpextrw $1, %xmm3, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $1, 1088(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $2, %xmm2, %eax +; AVX512F-NEXT: vpinsrw $1, 
(%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $2, %xmm3, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $2, 1152(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $3, %xmm2, %eax +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $3, %xmm3, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $3, 1216(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $4, %xmm2, %eax +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $4, %xmm3, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $4, 1280(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $5, %xmm2, %eax +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $5, %xmm3, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $5, 1344(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $6, %xmm2, %eax +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $6, %xmm3, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $6, 1408(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $7, %xmm2, %eax +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $7, %xmm3, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $7, 1472(%rsp,%rax,2), %xmm4, %xmm2 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm3 +; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl 512(%rsp,%rax,2), %eax +; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrw $1, %xmm3, %eax +; AVX512F-NEXT: vpextrw $1, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $1, 576(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $2, %xmm3, %eax +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $2, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $2, 640(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $3, %xmm3, %eax +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $3, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $3, 704(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $4, %xmm3, %eax +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $4, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $4, 768(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $5, %xmm3, %eax +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $5, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $5, 832(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $6, %xmm3, %eax +; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX512F-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $6, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $6, 896(%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $7, %xmm3, %eax +; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX512F-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $7, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $7, 960(%rsp,%rax,2), %xmm4, %xmm3 +; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX512F-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2 ; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX512F-NEXT: vmovd %eax, %xmm4 ; AVX512F-NEXT: vpextrw $1, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw 
$1, 64(%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $2, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $2, 128(%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $3, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $3, 192(%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $4, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $4, 256(%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $5, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $5, 320(%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $6, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $6, 384(%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $7, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $7, 448(%rsp,%rax,2), %xmm4, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp @@ -386,336 +326,221 @@ ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: movq %rsp, %rbp ; AVX512F-NEXT: andq $-64, %rsp -; AVX512F-NEXT: subq $4160, %rsp # imm = 0x1040 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-NEXT: subq $128, %rsp +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 ; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, (%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, 
{{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: 
vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: movzbl 3072(%rsp,%rax), %eax +; AVX512F-NEXT: vmovaps %zmm0, (%rsp) +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrb $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, 3136(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, 3200(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $3, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, 3264(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $4, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, 3328(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $5, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, 3392(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $6, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, 3456(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $7, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, 3520(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $8, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, 3584(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $8, 
(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $9, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, 3648(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $10, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, 3712(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $11, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, 3776(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $12, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, 3840(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $13, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, 3904(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $14, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, 3968(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $15, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, 4032(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl 2048(%rsp,%rax), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrb $1, %xmm2, %eax +; AVX512F-NEXT: vpextrb $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, 2112(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $2, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, 2176(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $3, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $3, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, 2240(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $4, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $4, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, 2304(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $5, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $5, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, 2368(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $6, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $6, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, 2432(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $7, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $7, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, 2496(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $8, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $8, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, 2560(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $9, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $9, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, 2624(%rsp,%rax), %xmm4, %xmm4 -; 
AVX512F-NEXT: vpextrb $10, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $10, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, 2688(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $11, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $11, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, 2752(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $12, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $12, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, 2816(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $13, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $13, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, 2880(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $14, %xmm2, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $14, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, 2944(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $15, %xmm2, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $15, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, 3008(%rsp,%rax), %xmm4, %xmm2 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 +; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl 1024(%rsp,%rax), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrb $1, %xmm3, %eax +; AVX512F-NEXT: vpextrb $1, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, 1088(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $2, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $2, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, 1152(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $3, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $3, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, 1216(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $4, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $4, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, 1280(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $5, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $5, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, 1344(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $6, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $6, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, 1408(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $7, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $7, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, 1472(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $8, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $8, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, 1536(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb 
$9, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $9, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, 1600(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $10, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $10, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, 1664(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $11, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $11, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, 1728(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $12, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, 1792(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $13, %xmm3, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, 1856(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $14, %xmm3, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, 1920(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $15, %xmm3, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $15, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, 1984(%rsp,%rax), %xmm4, %xmm3 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 ; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm4 ; AVX512F-NEXT: vpextrb $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, 64(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, 128(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, 192(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $4, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, 256(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $5, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, 320(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $6, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, 384(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $7, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, 448(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $8, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, 512(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $9, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, 576(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $9, 
(%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $10, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, 640(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $11, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, 704(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $12, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, 768(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $13, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, 832(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $14, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, 896(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $15, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, 960(%rsp,%rax), %xmm4, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp @@ -1239,359 +1064,246 @@ ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: movq %rsp, %rbp ; AVX512F-NEXT: andq $-64, %rsp -; AVX512F-NEXT: subq $4160, %rsp # imm = 0x1040 +; AVX512F-NEXT: subq $128, %rsp ; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi -; AVX512F-NEXT: vpbroadcastd %esi, %zmm4 -; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm4, %zmm1 -; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm4, %zmm2 -; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm4, %zmm3 -; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm4, %zmm4 -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, 
{{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 
$1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %ymm0, (%rsp) -; AVX512F-NEXT: movzbl 3968(%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, 3904(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, 3840(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, 3776(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: vpbroadcastd %esi, %zmm2 +; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm1 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: vmovaps %zmm0, (%rsp) ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, 3712(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpextrd $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, 3648(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, 3584(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: 
andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, 3520(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, 3456(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, 3392(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, 3328(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, 3264(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, 3200(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrd $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, 3136(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, 3072(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm5 +; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm3 +; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, 3008(%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl 2944(%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, 2880(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, 2816(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, 2752(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm8 ; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, 2688(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, 2624(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 ; AVX512F-NEXT: vpextrd $2, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, 
2560(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, 2496(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, 2432(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, 2368(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, 2304(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm4 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, 2240(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vextracti32x4 $3, %zmm5, %xmm1 +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, 2176(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vmovd %eax, %xmm5 ; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, 2112(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, 2048(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vpextrd $3, %xmm3, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, 1984(%rsp,%rax), %xmm4, %xmm3 -; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vmovd %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl 1920(%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrd $1, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, 1856(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $2, %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, 1792(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm2, %eax -; AVX512F-NEXT: vextracti128 
$1, %ymm2, %xmm5 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 +; AVX512F-NEXT: vpextrd $3, %xmm6, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, 1728(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 ; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, 1664(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, 1600(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $2, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, 1536(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, 1472(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vmovd %eax, %xmm7 +; AVX512F-NEXT: vpextrd $1, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, 1408(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $2, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, 1344(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $3, %xmm2, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, 1280(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, 1216(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $1, %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, 1152(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $1, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $2, %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, 1088(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $3, %xmm0, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm0 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, 1024(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm2, %eax +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, 960(%rsp,%rax), %xmm4, %xmm2 -; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $1, %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl 896(%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $2, %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, 832(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; 
AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpextrd $3, %xmm0, %eax +; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm0 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, 768(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm1, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm2 +; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, 704(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: vpextrd $1, %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, 640(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: vpextrd $2, %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, 576(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: vpextrd $3, %xmm0, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, 512(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, 448(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm3 +; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, 384(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: vpextrd $1, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, 320(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: andl $63, %esi -; AVX512F-NEXT: vpinsrb $10, 4032(%rsp,%rsi), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: vpextrd $2, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: vpextrd $3, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 ; AVX512F-NEXT: vpextrd $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3 ; AVX512F-NEXT: vpextrd $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm4, %xmm1 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: 
vpinsrb $15, %eax, %xmm3, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 -; AVX512F-NEXT: vmovaps %zmm1, 192(%rdi) -; AVX512F-NEXT: vmovaps %zmm2, 128(%rdi) -; AVX512F-NEXT: vmovaps %zmm3, 64(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm8, %zmm3 +; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 +; AVX512F-NEXT: vmovaps %zmm3, 192(%rdi) +; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi) +; AVX512F-NEXT: vmovaps %zmm2, 64(%rdi) ; AVX512F-NEXT: vmovaps %zmm0, (%rdi) ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -828,19 +828,19 @@ ; AVX512F-LABEL: test_cmp_v64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-NEXT: kmovw %k3, 6(%rdi) ; AVX512F-NEXT: kmovw %k2, 4(%rdi) @@ -852,19 +852,19 @@ ; AVX512DQ-LABEL: test_cmp_v64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-NEXT: kmovw %k3, 6(%rdi) ; AVX512DQ-NEXT: kmovw %k2, 4(%rdi) @@ -1472,16 +1472,20 @@ ; AVX512F-LABEL: test_cmp_v64i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; 
AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm0 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4 +; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm0 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-NEXT: kmovw %k3, 6(%rdi) @@ -1494,16 +1498,20 @@ ; AVX512DQ-LABEL: test_cmp_v64i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax -; AVX512DQ-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 -; AVX512DQ-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpmovsxwd %ymm4, %zmm4 +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-NEXT: kmovw %k3, 6(%rdi) @@ -1627,25 +1635,29 @@ ; AVX512F-LABEL: test_cmp_v128i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm4 -; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm5 +; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k0 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4 +; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k4 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k4 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k5 -; AVX512F-NEXT: 
vpcmpgtb %ymm7, %ymm3, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k6 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -1665,25 +1677,29 @@ ; AVX512DQ-LABEL: test_cmp_v128i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax -; AVX512DQ-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm4 -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm5 +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm4 +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 -; AVX512DQ-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 +; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm2 +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k4 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5 -; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -150,72 +150,72 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512F-NEXT:    vpsllvd %zmm7, %zmm8, %zmm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %ymm5, %ymm8, %ymm9
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT:    vpsrlvd %zmm9, %zmm3, %zmm3
-; AVX512F-NEXT:    vpord %zmm3, %zmm7, %zmm3
-; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpcmpeqw %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT:    vpsllvd %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vpsubw %ymm2, %ymm8, %ymm5
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm6, %zmm4, %zmm4
+; AVX512F-NEXT:    vpmovdw %zmm4, %ymm4
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm4, %ymm6
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm7 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm6, %zmm7, %zmm6
+; AVX512F-NEXT:    vpmovdw %zmm6, %ymm6
+; AVX512F-NEXT:    vpsubw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT:    vpsrlvd %zmm5, %zmm1, %zmm1
-; AVX512F-NEXT:    vpord %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT:    vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm6, %zmm1
+; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm5, %ymm4
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512F-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpmovdw %zmm3, %ymm3
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512VL-NEXT:    vpsllvd %zmm7, %zmm8, %zmm7
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %ymm5, %ymm8, %ymm9
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT:    vpsrlvd %zmm9, %zmm3, %zmm3
-; AVX512VL-NEXT:    vpord %zmm3, %zmm7, %zmm3
-; AVX512VL-NEXT:    vpmovdw %zmm3, %ymm3
-; AVX512VL-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT:    vpcmpeqw %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm5, %zmm4
-; AVX512VL-NEXT:    vpsubw %ymm2, %ymm8, %ymm5
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm6, %zmm4, %zmm4
+; AVX512VL-NEXT:    vpmovdw %zmm4, %ymm4
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm2, %ymm4, %ymm6
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm7 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm6, %zmm7, %zmm6
+; AVX512VL-NEXT:    vpmovdw %zmm6, %ymm6
+; AVX512VL-NEXT:    vpsubw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT:    vpsrlvd %zmm5, %zmm1, %zmm1
-; AVX512VL-NEXT:    vpord %zmm1, %zmm4, %zmm1
+; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512VL-NEXT:    vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm6, %zmm1
+; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm5, %ymm4
+; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i16:
@@ -260,146 +260,142 @@
 define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm6
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpsllw $4, %ymm3, %ymm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT:    vpand %ymm4, %ymm5, %ymm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm9
-; AVX512F-NEXT:    vpsllw $5, %ymm9, %ymm10
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm7, %ymm3, %ymm7
-; AVX512F-NEXT:    vpsllw $2, %ymm7, %ymm11
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT:    vpand %ymm6, %ymm11, %ymm11
-; AVX512F-NEXT:    vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm11, %ymm7, %ymm7
-; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm11
-; AVX512F-NEXT:    vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm11, %ymm7, %ymm10
-; AVX512F-NEXT:    vpsrlw $4, %ymm8, %ymm11
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT:    vpand %ymm7, %ymm11, %ymm11
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT:    vpsubb %ymm9, %ymm12, %ymm13
-; AVX512F-NEXT:    vpsllw $5, %ymm13, %ymm13
-; AVX512F-NEXT:    vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT:    vpsrlw $2, %ymm8, %ymm11
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT:    vpand %ymm14, %ymm11, %ymm11
-; AVX512F-NEXT:    vpaddb %ymm13, %ymm13, %ymm13
-; AVX512F-NEXT:    vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT:    vpsrlw $1, %ymm8, %ymm11
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpand %ymm15, %ymm11, %ymm11
-; AVX512F-NEXT:    vpaddb %ymm13, %ymm13, %ymm13
-; AVX512F-NEXT:    vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT:    vpor %ymm8, %ymm10, %ymm8
-; AVX512F-NEXT:    vpxor %xmm10, %xmm10, %xmm10
-; AVX512F-NEXT:    vpcmpeqb %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT:    vpblendvb %ymm9, %ymm3, %ymm8, %ymm3
-; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm8
-; AVX512F-NEXT:    vpand %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm5
-; AVX512F-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
-; AVX512F-NEXT:    vpsllw $2, %ymm4, %ymm8
-; AVX512F-NEXT:    vpand %ymm6, %ymm8, %ymm6
-; AVX512F-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT:    vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm6
-; AVX512F-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT:    vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX512F-NEXT:    vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT:    vpsubb %ymm2, %ymm12, %ymm6
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT:    vpsrlw $4, %ymm4, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm6
+; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm7, %ymm8
+; AVX512F-NEXT:    vpsllw $5, %ymm8, %ymm8
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $2, %ymm4, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT:    vpand %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm6, %ymm10, %ymm6
+; AVX512F-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm7, %ymm6
 ; AVX512F-NEXT:    vpsllw $5, %ymm6, %ymm6
 ; AVX512F-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $2, %ymm1, %ymm5
-; AVX512F-NEXT:    vpand %ymm5, %ymm14, %ymm5
+; AVX512F-NEXT:    vpand %ymm5, %ymm9, %ymm5
 ; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
 ; AVX512F-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm5
-; AVX512F-NEXT:    vpand %ymm5, %ymm15, %ymm5
+; AVX512F-NEXT:    vpand %ymm5, %ymm10, %ymm5
 ; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
 ; AVX512F-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT:    vpcmpeqb %ymm2, %ymm10, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT:    vpsllw $4, %ymm4, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm7
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw $2, %ymm4, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; AVX512F-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm5, %ymm0, %ymm5
+; AVX512F-NEXT:    vpsllw $2, %ymm5, %ymm7
+; AVX512F-NEXT:    vpand %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT:    vpaddb %ymm5, %ymm5, %ymm7
+; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vporq %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT:    vpsllw $4, %ymm3, %ymm6
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT:    vpand %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm5
-; AVX512VL-NEXT:    vpsllw $5, %ymm5, %ymm9
-; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm6, %ymm3, %ymm6
-; AVX512VL-NEXT:    vpsllw $2, %ymm6, %ymm10
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT:    vpand %ymm11, %ymm10, %ymm10
-; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm10
-; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
-; AVX512VL-NEXT:    vpsrlw $4, %ymm4, %ymm9
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm10, %ymm9, %ymm9
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT:    vpsubb %ymm5, %ymm12, %ymm13
-; AVX512VL-NEXT:    vpsllw $5, %ymm13, %ymm13
-; AVX512VL-NEXT:    vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpsrlw $2, %ymm4, %ymm9
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT:    vpand %ymm14, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpaddb %ymm13, %ymm13, %ymm13
-; AVX512VL-NEXT:    vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpsrlw $1, %ymm4, %ymm9
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT:    vpand %ymm15, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpaddb %ymm13, %ymm13, %ymm13
-; AVX512VL-NEXT:    vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
-; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT:    vpcmpeqb %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpand %ymm2, %ymm8, %ymm2
-; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm5
-; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm7
-; AVX512VL-NEXT:    vpand %ymm7, %ymm11, %ymm7
-; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
-; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpand %ymm5, %ymm10, %ymm5
-; AVX512VL-NEXT:    vpsubb %ymm2, %ymm12, %ymm7
-; AVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw $4, %ymm4, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm6
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %ymm3, %ymm7, %ymm8
+; AVX512VL-NEXT:    vpsllw $5, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsrlw $2, %ymm4, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsrlw $1, %ymm4, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm10, %ymm6
+; AVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; AVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT:    vpsubb %ymm2, %ymm7, %ymm6
+; AVX512VL-NEXT:    vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpand %ymm5, %ymm14, %ymm5
-; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm5, %ymm9, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm5
-; AVX512VL-NEXT:    vpand %ymm5, %ymm15, %ymm5
+; AVX512VL-NEXT:    vpand %ymm5, %ymm10, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT:    vpsllw $4, %ymm4, %ymm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm5
 ; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT:    vpcmpeqb %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm5
+; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm6
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm0, %ymm5
+; AVX512VL-NEXT:    vpsllw $2, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpand %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm7
+; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-NEXT:    vporq %zmm1, %zmm4, %zmm1
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v64i8:
@@ -722,48 +718,58 @@
 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
 ; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT:    vpsllw %xmm5, %ymm4, %ymm6
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %xmm2, %xmm7, %xmm7
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512F-NEXT:    vpsrlw %xmm7, %ymm3, %ymm3
-; AVX512F-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vpsllw %xmm5, %ymm0, %ymm4
-; AVX512F-NEXT:    vpsrlw %xmm7, %ymm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
+; AVX512F-NEXT:    vpsrlw %xmm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512F-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
 ; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsllw %xmm5, %ymm4, %ymm6
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %xmm2, %xmm7, %xmm7
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT:    vpcmpeqw %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsllw %xmm5, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm4, %xmm5, %xmm6
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm7
+; AVX512VL-NEXT:    vpsrlw %xmm6, %ymm7, %ymm6
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
@@ -817,68 +823,84 @@
 define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm9
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
 ; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpsllw %xmm5, %ymm4, %ymm6
-; AVX512F-NEXT:    vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX512F-NEXT:    vpsllw %xmm5, %xmm8, %xmm7
-; AVX512F-NEXT:    vpbroadcastb %xmm7, %ymm7
-; AVX512F-NEXT:    vpand %ymm7, %ymm6, %ymm10
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT:    vpsubb %xmm2, %xmm3, %xmm3
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm9, %ymm9
-; AVX512F-NEXT:    vpsrlw %xmm3, %xmm8, %xmm6
-; AVX512F-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
+; AVX512F-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    vpsllw %xmm3, %xmm5, %xmm6
 ; AVX512F-NEXT:    vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT:    vpand %ymm6, %ymm9, %ymm8
-; AVX512F-NEXT:    vpor %ymm8, %ymm10, %ymm8
-; AVX512F-NEXT:    vpxor %xmm9, %xmm9, %xmm9
-; AVX512F-NEXT:    vpcmpeqb %ymm2, %ymm9, %ymm2
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT:    vpsllw %xmm5, %ymm0, %ymm5
-; AVX512F-NEXT:    vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT:    vpsrlw %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT:    vpsubb %xmm4, %xmm6, %xmm7
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
+; AVX512F-NEXT:    vpsrlw %xmm7, %ymm8, %ymm8
+; AVX512F-NEXT:    vpsrlw %xmm7, %xmm5, %xmm7
+; AVX512F-NEXT:    vpsrlw $8, %xmm7, %xmm7
+; AVX512F-NEXT:    vpbroadcastb %xmm7, %ymm7
+; AVX512F-NEXT:    vpand %ymm7, %ymm8, %ymm7
+; AVX512F-NEXT:    vpsubb %xmm2, %xmm6, %xmm6
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpsrlw %xmm6, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlw %xmm6, %xmm5, %xmm5
+; AVX512F-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX512F-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512F-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512F-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm9
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
 ; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vpsllw %xmm5, %ymm4, %ymm6
-; AVX512VL-NEXT:    vpcmpeqd %xmm8, %xmm8, %xmm8
-; AVX512VL-NEXT:    vpsllw %xmm5, %xmm8, %xmm7
-; AVX512VL-NEXT:    vpbroadcastb %xmm7, %ymm7
-; AVX512VL-NEXT:    vpand %ymm7, %ymm6, %ymm10
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT:    vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm9, %ymm9
-; AVX512VL-NEXT:    vpsrlw %xmm3, %xmm8, %xmm6
-; AVX512VL-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT:    vpsllw %xmm3, %xmm5, %xmm6
 ; AVX512VL-NEXT:    vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT:    vpand %ymm6, %ymm9, %ymm8
-; AVX512VL-NEXT:    vpor %ymm8, %ymm10, %ymm8
-; AVX512VL-NEXT:    vpxor %xmm9, %xmm9, %xmm9
-; AVX512VL-NEXT:    vpcmpeqb %ymm2, %ymm9, %ymm2
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT:    vpsllw %xmm5, %ymm0, %ymm5
-; AVX512VL-NEXT:    vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm4, %xmm6, %xmm7
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm8
+; AVX512VL-NEXT:    vpsrlw %xmm7, %ymm8, %ymm8
+; AVX512VL-NEXT:    vpsrlw %xmm7, %xmm5, %xmm7
+; AVX512VL-NEXT:    vpsrlw $8, %xmm7, %xmm7
+; AVX512VL-NEXT:    vpbroadcastb %xmm7, %ymm7
+; AVX512VL-NEXT:    vpand %ymm7, %ymm8, %ymm7
+; AVX512VL-NEXT:    vpsubb %xmm2, %xmm6, %xmm6
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpsrlw %xmm6, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlw %xmm6, %xmm5, %xmm5
+; AVX512VL-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; AVX512VL-NEXT:    vpbroadcastb %xmm5, %ymm5
+; AVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm7, %zmm1, %zmm1
+; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
@@ -1072,16 +1094,16 @@
 define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 ; AVX512F-LABEL: constant_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 =
-; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vpmullw %ymm4, %ymm2, %ymm5
-; AVX512F-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm3
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 =
+; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm5
+; AVX512F-NEXT:    vpor %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmullw %ymm3, %ymm0, %ymm3
 ; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -1090,16 +1112,16 @@
 ;
 ; AVX512VL-LABEL: constant_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 =
-; AVX512VL-NEXT:    vpmulhuw %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpmullw %ymm4, %ymm2, %ymm5
-; AVX512VL-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512VL-NEXT:    vpmulhuw %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpmullw %ymm4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 =
+; AVX512VL-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT:    vpmullw %ymm3, %ymm4, %ymm5
+; AVX512VL-NEXT:    vpor %ymm2, %ymm5, %ymm2
+; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3,4,5,6,7]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmullw %ymm3, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
 ; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -1144,107 +1166,107 @@
 define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; AVX512F-LABEL: constant_funnnel_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpsllw $4, %ymm3, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
-; AVX512F-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT:    vpsllw $2, %ymm4, %ymm7
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT:    vpand %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT:    vpaddb %ymm6, %ymm6, %ymm9
-; AVX512F-NEXT:    vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
-; AVX512F-NEXT:    vpaddb %ymm9, %ymm9, %ymm10
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31]
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT:    vpsllw $2, %ymm3, %ymm6
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT:    vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT:    vpaddb %ymm5, %ymm5, %ymm8
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm6
+; AVX512F-NEXT:    vpaddb %ymm8, %ymm8, %ymm9
+; AVX512F-NEXT:    vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
+; AVX512F-NEXT:    vpxor %xmm10, %xmm10, %xmm10
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15],ymm6[24],ymm10[24],ymm6[25],ymm10[25],ymm6[26],ymm10[26],ymm6[27],ymm10[27],ymm6[28],ymm10[28],ymm6[29],ymm10[29],ymm6[30],ymm10[30],ymm6[31],ymm10[31]
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
 ; AVX512F-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512F-NEXT:    vpmullw %ymm12, %ymm11, %ymm11
 ; AVX512F-NEXT:    vpsrlw $8, %ymm11, %ymm11
-; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23]
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23]
 ; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
 ; AVX512F-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpmullw %ymm2, %ymm13, %ymm2
-; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT:    vpackuswb %ymm11, %ymm2, %ymm2
-; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpmullw %ymm6, %ymm13, %ymm6
+; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm6
+; AVX512F-NEXT:    vpackuswb %ymm11, %ymm6, %ymm6
+; AVX512F-NEXT:    vpor %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT:    vpblendvb %ymm6, %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT:    vpsllw $2, %ymm3, %ymm5
-; AVX512F-NEXT:    vpand %ymm5, %ymm8, %ymm5
-; AVX512F-NEXT:    vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm5
-; AVX512F-NEXT:    vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15],ymm1[24],ymm7[24],ymm1[25],ymm7[25],ymm1[26],ymm7[26],ymm1[27],ymm7[27],ymm1[28],ymm7[28],ymm1[29],ymm7[29],ymm1[30],ymm7[30],ymm1[31],ymm7[31]
-; AVX512F-NEXT:    vpmullw %ymm5, %ymm12, %ymm5
-; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23]
+; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpblendvb %ymm5, %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT:    vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm10[8],ymm1[9],ymm10[9],ymm1[10],ymm10[10],ymm1[11],ymm10[11],ymm1[12],ymm10[12],ymm1[13],ymm10[13],ymm1[14],ymm10[14],ymm1[15],ymm10[15],ymm1[24],ymm10[24],ymm1[25],ymm10[25],ymm1[26],ymm10[26],ymm1[27],ymm10[27],ymm1[28],ymm10[28],ymm1[29],ymm10[29],ymm1[30],ymm10[30],ymm1[31],ymm10[31]
+; AVX512F-NEXT:    vpmullw %ymm4, %ymm12, %ymm4
+; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[4],ymm10[4],ymm1[5],ymm10[5],ymm1[6],ymm10[6],ymm1[7],ymm10[7],ymm1[16],ymm10[16],ymm1[17],ymm10[17],ymm1[18],ymm10[18],ymm1[19],ymm10[19],ymm1[20],ymm10[20],ymm1[21],ymm10[21],ymm1[22],ymm10[22],ymm1[23],ymm10[23]
 ; AVX512F-NEXT:    vpmullw %ymm1, %ymm13, %ymm1
 ; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: constant_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT:    vpsllw $4, %ymm3, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT:    vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
-; AVX512VL-NEXT:    # ymm6 = mem[0,1,0,1]
-; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm4
-; AVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm7
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT:    vpand %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm9
-; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
-; AVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm10
-; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31]
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpsllw $2, %ymm3, %ymm6
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT:    vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm8
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm6
+; AVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm9
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
+; AVX512VL-NEXT:    vpxor %xmm10, %xmm10, %xmm10
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15],ymm6[24],ymm10[24],ymm6[25],ymm10[25],ymm6[26],ymm10[26],ymm6[27],ymm10[27],ymm6[28],ymm10[28],ymm6[29],ymm10[29],ymm6[30],ymm10[30],ymm6[31],ymm10[31]
 ; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
 ; AVX512VL-NEXT:    # ymm12 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    vpmullw %ymm12, %ymm11, %ymm11
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm11, %ymm11
-; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23]
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23]
 ; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
 ; AVX512VL-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512VL-NEXT:    vpmullw %ymm2, %ymm13, %ymm2
-; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpackuswb %ymm11, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpmullw %ymm6, %ymm13, %ymm6
+; AVX512VL-NEXT:    vpsrlw $8, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpackuswb %ymm11, %ymm6, %ymm6
+; AVX512VL-NEXT:    vpor %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm3, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpsllw $2, %ymm3, %ymm5
-; AVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm5
-; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm5
-; AVX512VL-NEXT:    vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15],ymm1[24],ymm7[24],ymm1[25],ymm7[25],ymm1[26],ymm7[26],ymm1[27],ymm7[27],ymm1[28],ymm7[28],ymm1[29],ymm7[29],ymm1[30],ymm7[30],ymm1[31],ymm7[31]
-; AVX512VL-NEXT:    vpmullw %ymm5, %ymm12, %ymm5
-; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23]
+; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpblendvb %ymm5, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm10[8],ymm1[9],ymm10[9],ymm1[10],ymm10[10],ymm1[11],ymm10[11],ymm1[12],ymm10[12],ymm1[13],ymm10[13],ymm1[14],ymm10[14],ymm1[15],ymm10[15],ymm1[24],ymm10[24],ymm1[25],ymm10[25],ymm1[26],ymm10[26],ymm1[27],ymm10[27],ymm1[28],ymm10[28],ymm1[29],ymm10[29],ymm1[30],ymm10[30],ymm1[31],ymm10[31]
+; AVX512VL-NEXT:    vpmullw %ymm4, %ymm12, %ymm4
+; AVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[4],ymm10[4],ymm1[5],ymm10[5],ymm1[6],ymm10[6],ymm1[7],ymm10[7],ymm1[16],ymm10[16],ymm1[17],ymm10[17],ymm1[18],ymm10[18],ymm1[19],ymm10[19],ymm1[20],ymm10[20],ymm1[21],ymm10[21],ymm1[22],ymm10[22],ymm1[23],ymm10[23]
 ; AVX512VL-NEXT:    vpmullw %ymm1, %ymm13, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
@@ -1466,28 +1488,28 @@
 define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 ; AVX512F-LABEL: splatconstant_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT:    vpsrlw $9, %ymm3, %ymm3
-; AVX512F-NEXT:    vpsllw $7, %ymm2, %ymm2
-; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $9, %ymm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $9, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT:    vpsrlw $9, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsllw $7, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $9, %ymm1, %ymm2
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $9, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm2
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
@@ -1520,34 +1542,40 @@
 define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; AVX512F-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT:    vpandn %ymm3, %ymm4, %ymm3
 ; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm2
-; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpandn %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT:    vpsrlw $4, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT:    vpternlogq $226, %ymm3, %ymm4, %ymm2
-; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpternlogq $226, %ymm1, %ymm4, %ymm0
+; AVX512VL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -34,56 +34,64 @@
 define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT:    vpsllvd %zmm5, %zmm2, %zmm5
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %ymm3, %ymm6, %ymm3
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT:    vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT:    vpord %zmm2, %zmm5, %zmm2
-; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm3
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT:    vpsllvd %zmm4, %zmm2, %zmm4
+; AVX512F-NEXT:    vpmovdw %zmm4, %ymm4
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
 ; AVX512F-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT:    vpsubw %ymm1, %ymm4, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT:    vpsrlvd %zmm5, %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpord %zmm0, %zmm3, %zmm0
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT:    vpsllvd %zmm5, %zmm2, %zmm5
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %ymm3, %ymm6, %ymm3
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT:    vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT:    vpord %zmm2, %zmm5, %zmm2
-; AVX512VL-NEXT:    vpmovdw %zmm2, %ymm2
-; AVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm3
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm2, %zmm4
+; AVX512VL-NEXT:    vpmovdw %zmm4, %ymm4
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
 ; AVX512VL-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512VL-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-NEXT:    vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT:    vpsubw %ymm1, %ymm4, %ymm5
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT:    vpsrlvd %zmm5, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpand %ymm6, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpord %zmm0, %zmm3, %zmm0
 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT:    vporq %zmm0, %zmm3, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i16:
@@ -116,94 +124,138 @@
 define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm4
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpand %ymm7, %ymm5,
%ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm10, %ymm8, %ymm8 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm5 +; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4 +; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6 -; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6 -; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7 -; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7 -; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8 -; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm4 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, 
%ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5 -; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 +; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-NEXT: vpsubb %ymm5, %ymm6, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3 +; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm10, %ymm8, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; 
AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm4 +; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm5 +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 -; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4 +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v64i8: @@ -314,38 +366,60 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 -; AVX512F-NEXT: vmovdqa 
{{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpsllw %xmm1, %ymm3, %ymm4 +; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsrlvd %zmm5, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsrlvd %zmm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm2 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpsllw %xmm1, %ymm3, %ymm4 +; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = 
ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -385,58 +459,108 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm6 +; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX512F-NEXT: vpsubb %ymm6, %ymm7, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm4 +; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm10, %ymm9, %ymm9 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm11, %ymm9, %ymm9 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm5 +; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5 +; 
AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm6 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6 -; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6 -; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 -; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512F-NEXT: vpsllw %xmm1, %xmm4, %xmm4 +; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm2 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm6 +; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX512VL-NEXT: vpsubb %ymm6, %ymm7, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm4 +; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm9 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm9 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm11, %ymm9, %ymm9 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5 +; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm3 +; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 +; 
AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm5 +; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm11, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6 -; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 -; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 -; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpsllw %xmm1, %xmm4, %xmm4 +; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: @@ -519,15 +643,15 @@ ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] -; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v32i16: @@ -537,15 +661,15 @@ ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512VL-NEXT: vmovdqa 
{{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] -; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512VL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i16: @@ -583,36 +707,36 @@ ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] -; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9 -; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpmullw %ymm1, %ymm11, %ymm1 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3 -; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] -; AVX512F-NEXT: vpmullw %ymm3, %ymm10, %ymm3 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] -; AVX512F-NEXT: vpmullw %ymm0, %ymm11, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31] +; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23] +; AVX512F-NEXT: vpmullw %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v64i8: @@ -632,36 +756,36 @@ ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] -; AVX512VL-NEXT: # ymm10 = 
mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9 -; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] -; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3 -; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] -; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3 -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] -; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31] +; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23] +; AVX512VL-NEXT: vpmullw %ymm6, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v64i8: @@ -746,26 +870,26 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2 -; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2 +; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2 -; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i16: @@ -790,30 +914,36 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -146,72 +146,72 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512F-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpsllvd %zmm9, %zmm3, %zmm3 -; AVX512F-NEXT: vpord %zmm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpsrlvd %zmm6, %zmm4, %zmm4 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm6 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm6, %zmm7, %zmm6 +; AVX512F-NEXT: vpmovdw %zmm6, %ymm6 +; AVX512F-NEXT: vpsubw %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 
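; A minimal sketch, assuming AVX512F without AVX512BW, of the widening
; strategy these checks exercise: there is no variable-count <32 x i16>
; shift instruction, so each 256-bit half is zero-extended and shifted in
; 32-bit lanes instead (register names below are placeholders):
;   vpmovzxwd %ymm_x,   %zmm_t            # widen 16 x i16 -> 16 x i32
;   vpsrlvd   %zmm_amt, %zmm_t, %zmm_t    # AVX512F per-element shift
;   vpmovdw   %zmm_t,   %ymm_r            # truncate back to 16 x i16
; The two halves are rejoined with vinserti64x4 and merged with vporq; for
; FSHR, lanes whose shift amount is zero are restored from the original
; input via vpcmpeqw and a vpternlogq bit-select, as the checks below show.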
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm5, %ymm4 +; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 +; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512VL-NEXT: vpsllvd %zmm9, %zmm3, %zmm3 -; AVX512VL-NEXT: vpord %zmm7, %zmm3, %zmm3 -; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 -; 
AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4 -; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm6, %zmm4, %zmm4 +; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm6 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm6, %zmm7, %zmm6 +; AVX512VL-NEXT: vpmovdw %zmm6, %ymm6 +; AVX512VL-NEXT: vpsubw %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm0 -; AVX512VL-NEXT: vpord %zmm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm5, %ymm4 +; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i16: @@ -256,146 
+256,142 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm8 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9 -; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7 -; AVX512F-NEXT: vpsrlw $2, %ymm7, %ymm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11 -; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm11 -; AVX512F-NEXT: vpsrlw $1, %ymm11, %ymm12 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm7, %ymm12, %ymm12 -; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm10 -; AVX512F-NEXT: vpsllw $4, %ymm8, %ymm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpand %ymm12, %ymm11, %ymm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %ymm9, %ymm13, %ymm14 -; AVX512F-NEXT: vpsllw $5, %ymm14, %ymm14 -; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8 -; AVX512F-NEXT: vpsllw $2, %ymm8, %ymm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11 -; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14 -; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8 -; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm11 -; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14 -; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8 -; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8 -; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm8 -; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4 -; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5 -; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm8 -; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 -; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6 -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 -; AVX512F-NEXT: vpand %ymm5, %ymm12, %ymm5 -; AVX512F-NEXT: vpsubb %ymm2, %ymm13, %ymm6 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6 +; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm8 +; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6 +; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm6 ; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5 -; AVX512F-NEXT: vpand %ymm5, %ymm15, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm10, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5 +; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5 +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm5 +; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7 +; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm7 +; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-NEXT: vporq %zmm4, %zmm0, %zmm4 +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpcmpeqb %ymm0, 
%ymm3, %ymm3 +; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $202, %zmm4, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v64i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 -; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6 -; AVX512VL-NEXT: vpsrlw $2, %ymm6, %ymm10 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10 -; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6 -; AVX512VL-NEXT: vpsrlw $1, %ymm6, %ymm10 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-NEXT: vpand %ymm12, %ymm10, %ymm10 -; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6 -; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm9 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %ymm5, %ymm13, %ymm14 -; AVX512VL-NEXT: vpsllw $5, %ymm14, %ymm14 -; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm9 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9 -; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14 -; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm9 -; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14 -; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpand %ymm2, %ymm8, %ymm2 -; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4 -; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm7 -; AVX512VL-NEXT: vpand %ymm7, %ymm11, %ymm7 -; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm7 -; AVX512VL-NEXT: vpand %ymm7, %ymm12, %ymm7 -; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5 -; AVX512VL-NEXT: vpsubb %ymm2, %ymm13, %ymm7 -; AVX512VL-NEXT: vpsllw $5, %ymm7, 
%ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6 +; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm8 +; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5 +; AVX512VL-NEXT: vpsubb %ymm2, %ymm7, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm15, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm5 +; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm7 +; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 
+; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm7 +; AVX512VL-NEXT: vpand %ymm7, %ymm9, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512VL-NEXT: vporq %zmm4, %zmm0, %zmm4 +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm3 +; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpternlogq $202, %zmm4, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v64i8: @@ -710,48 +706,58 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm4 -; AVX512F-NEXT: vpsllw %xmm7, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %xmm4, %xmm5, %xmm6 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm7 +; AVX512F-NEXT: vpsllw %xmm6, %ymm7, %ymm6 +; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm5 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm4, %ymm4 +; AVX512F-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7 -; AVX512VL-NEXT: 
vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm7, %ymm3, %ymm3 -; AVX512VL-NEXT: vpor %ymm6, %ymm3, %ymm3 -; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm4 -; AVX512VL-NEXT: vpsllw %xmm7, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %xmm4, %xmm5, %xmm6 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm7 +; AVX512VL-NEXT: vpsllw %xmm6, %ymm7, %ymm6 +; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm5 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -805,68 +811,82 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm9 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 -; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX512F-NEXT: vpsrlw %xmm5, %xmm8, %xmm7 -; AVX512F-NEXT: vpsrlw $8, %xmm7, %xmm7 -; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm9, %ymm9 -; AVX512F-NEXT: vpsllw %xmm3, %xmm8, %xmm6 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vpsrlw %xmm3, %xmm5, %xmm6 +; AVX512F-NEXT: vpsrlw $8, %xmm6, %xmm6 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6 -; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8 -; AVX512F-NEXT: vpor %ymm10, %ymm8, 
%ymm8 -; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm9, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 -; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm7 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512F-NEXT: vpsllw %xmm7, %ymm8, %ymm8 +; AVX512F-NEXT: vpsllw %xmm7, %xmm5, %xmm7 +; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7 +; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7 +; AVX512F-NEXT: vpsubb %xmm2, %xmm6, %xmm6 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm4, %ymm4 +; AVX512F-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm9 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 -; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX512VL-NEXT: vpsrlw %xmm5, %xmm8, %xmm7 -; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7 -; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm9, %ymm9 -; AVX512VL-NEXT: vpsllw %xmm3, %xmm8, %xmm6 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsrlw %xmm3, %xmm5, %xmm6 +; AVX512VL-NEXT: vpsrlw $8, %xmm6, %xmm6 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 -; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8 -; AVX512VL-NEXT: vpor %ymm10, %ymm8, %ymm8 -; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm9, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, 
%ymm8, %ymm4 -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm7 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512VL-NEXT: vpsllw %xmm7, %ymm8, %ymm8 +; AVX512VL-NEXT: vpsllw %xmm7, %xmm5, %xmm7 +; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7 +; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7 +; AVX512VL-NEXT: vpsubb %xmm2, %xmm6, %xmm6 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm6, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw %xmm6, %xmm5, %xmm5 +; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: @@ -1060,17 +1080,17 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX512F-LABEL: constant_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpor %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3 -; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -1078,17 +1098,17 @@ ; ; AVX512VL-LABEL: constant_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5 -; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; 
AVX512VL-NEXT: vpor %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3 -; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512VL-NEXT: vpmullw %ymm3, %ymm5, %ymm5 +; AVX512VL-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm4 +; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -1132,49 +1152,49 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-LABEL: constant_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] -; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm9 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm7, 
%ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 +; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm3[8],ymm9[8],ymm3[9],ymm9[9],ymm3[10],ymm9[10],ymm3[11],ymm9[11],ymm3[12],ymm9[12],ymm3[13],ymm9[13],ymm3[14],ymm9[14],ymm3[15],ymm9[15],ymm3[24],ymm9[24],ymm3[25],ymm9[25],ymm3[26],ymm9[26],ymm3[27],ymm9[27],ymm3[28],ymm9[28],ymm3[29],ymm9[29],ymm3[30],ymm9[30],ymm3[31],ymm9[31] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512F-NEXT: vpmullw %ymm11, %ymm10, %ymm10 ; AVX512F-NEXT: vpsrlw $8, %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[16],ymm9[16],ymm3[17],ymm9[17],ymm3[18],ymm9[18],ymm3[19],ymm9[19],ymm3[20],ymm9[20],ymm3[21],ymm9[21],ymm3[22],ymm9[22],ymm3[23],ymm9[23] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512F-NEXT: vpmullw %ymm13, %ymm12, %ymm12 ; AVX512F-NEXT: vpsrlw $8, %ymm12, %ymm12 ; AVX512F-NEXT: vpackuswb %ymm10, %ymm12, %ymm10 -; AVX512F-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512F-NEXT: vpor %ymm2, %ymm10, %ymm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] -; AVX512F-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31] +; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15],ymm1[24],ymm9[24],ymm1[25],ymm9[25],ymm1[26],ymm9[26],ymm1[27],ymm9[27],ymm1[28],ymm9[28],ymm1[29],ymm9[29],ymm1[30],ymm9[30],ymm1[31],ymm9[31] ; AVX512F-NEXT: vpmullw %ymm3, %ymm11, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[16],ymm9[16],ymm1[17],ymm9[17],ymm1[18],ymm9[18],ymm1[19],ymm9[19],ymm1[20],ymm9[20],ymm1[21],ymm9[21],ymm1[22],ymm9[22],ymm1[23],ymm9[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm13, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 @@ -1185,49 +1205,49 @@ ; ; AVX512VL-LABEL: constant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] -; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm10 = 
ymm3[8],ymm9[8],ymm3[9],ymm9[9],ymm3[10],ymm9[10],ymm3[11],ymm9[11],ymm3[12],ymm9[12],ymm3[13],ymm9[13],ymm3[14],ymm9[14],ymm3[15],ymm9[15],ymm3[24],ymm9[24],ymm3[25],ymm9[25],ymm3[26],ymm9[26],ymm3[27],ymm9[27],ymm3[28],ymm9[28],ymm3[29],ymm9[29],ymm3[30],ymm9[30],ymm3[31],ymm9[31] ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512VL-NEXT: vpmullw %ymm11, %ymm10, %ymm10 ; AVX512VL-NEXT: vpsrlw $8, %ymm10, %ymm10 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[16],ymm9[16],ymm3[17],ymm9[17],ymm3[18],ymm9[18],ymm3[19],ymm9[19],ymm3[20],ymm9[20],ymm3[21],ymm9[21],ymm3[22],ymm9[22],ymm3[23],ymm9[23] ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512VL-NEXT: vpmullw %ymm13, %ymm12, %ymm12 ; AVX512VL-NEXT: vpsrlw $8, %ymm12, %ymm12 ; AVX512VL-NEXT: vpackuswb %ymm10, %ymm12, %ymm10 -; AVX512VL-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512VL-NEXT: vpor %ymm2, %ymm10, %ymm2 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] -; AVX512VL-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31] +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15],ymm1[24],ymm9[24],ymm1[25],ymm9[25],ymm1[26],ymm9[26],ymm1[27],ymm9[27],ymm1[28],ymm9[28],ymm1[29],ymm9[29],ymm1[30],ymm9[30],ymm1[31],ymm9[31] ; AVX512VL-NEXT: vpmullw %ymm3, %ymm11, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[16],ymm9[16],ymm1[17],ymm9[17],ymm1[18],ymm9[18],ymm1[19],ymm9[19],ymm1[20],ymm9[20],ymm1[21],ymm9[21],ymm1[22],ymm9[22],ymm1[23],ymm9[23] ; AVX512VL-NEXT: vpmullw %ymm4, %ymm13, %ymm4 ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 @@ -1450,28 +1470,28 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3 -; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2 -; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i16: @@ -1504,34 +1524,40 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v64i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand 
%ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm4, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm4, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -34,62 +34,64 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsllvd %zmm6, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %ymm3, %ymm7, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm2, %zmm4 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpord %zmm2, %zmm6, %zmm2 +; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 -; AVX512F-NEXT: vpsubw %ymm1, %ymm7, %ymm1 +; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 +; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsllvd %zmm6, %zmm2, %zmm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %ymm3, %ymm7, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm2, %zmm4 +; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm3 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 -; AVX512VL-NEXT: vpord %zmm2, %zmm6, %zmm2 +; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1 -; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 -; AVX512VL-NEXT: vpsubw %ymm1, %ymm7, %ymm1 +; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 +; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i16: @@ -122,100 +124,138 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6 -; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6 -; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4 +; AVX512F-NEXT: vpsubb %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm8 -; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8 -; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm5 +; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4 +; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: 
vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm9
-; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm7, %ymm3
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
-; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsubb %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: vpsubb %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9
-; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5
-; AVX512VL-NEXT: vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm5
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
@@ -326,42 +366,60 @@
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
@@ -401,62 +459,104 @@
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm6
+; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT: vpsubb %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm10, %ymm9, %ymm9
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm9
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpsubb %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm5
+; AVX512F-NEXT: vpsllw $2, %ymm5, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm6
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
+; AVX512F-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm6
+; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512VL-NEXT: vpsubb %ymm6, %ymm7, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm9
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm9
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
+; AVX512VL-NEXT: vpsubb %ymm3, %ymm7, %ymm3
+; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm5
+; AVX512VL-NEXT: vpsllw $2, %ymm5, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
@@ -539,15 +639,15 @@
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
@@ -557,15 +657,15 @@
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
@@ -603,36 +703,36 @@
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512F-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
@@ -652,36 +752,36 @@
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
@@ -766,26 +866,26 @@
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512F-NEXT: vpsllw $9, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $9, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
@@ -810,30 +910,36 @@
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -447,15 +447,16 @@
; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm4
+; AVX512F-NEXT: vpsubw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3
; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -103,19 +103,19 @@
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
@@ -439,16 +439,17 @@
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm4
+; AVX512F-NEXT: vpsubw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
--- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
@@ -130,28 +130,28 @@
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-LABEL: testv32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm3
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
+; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4
+; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
@@ -171,14 +171,14 @@
;
; AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: retq
;
; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -1600,9 +1600,10 @@
; AVX512F-LABEL: icmp_v32i16_v32i1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
@@ -1689,6 +1690,7 @@
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: kandw %k1, %k0, %k0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -1489,8 +1489,10 @@
;
; AVX512DQ-LABEL: test_v64i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1507,8 +1509,10 @@
;
; AVX512DQVL-LABEL: test_v64i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -2359,11 +2363,10 @@
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpmullw %xmm0, %xmm3, %xmm0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512DQ-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: vpmullw %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
@@ -2395,11 +2398,10 @@
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm3, %xmm0
+; AVX512DQVL-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512DQVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
@@ -2750,39 +2752,40 @@
;
; AVX512DQ-LABEL: test_v128i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
+; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm1
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm4, %xmm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm3, %xmm1
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2793,39 +2796,40 @@ ; ; AVX512DQVL-LABEL: test_v128i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm4, %xmm1 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQVL-NEXT: vextracti128 $1, %ymm3, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm3, %xmm1 +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm2, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll 
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -1688,6 +1688,7 @@
; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -1938,6 +1938,7 @@
; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: kxorw %k1, %k0, %k0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -35,56 +35,54 @@
define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm1, %zmm5, %zmm1
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm5, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
+; AVX512VL-NEXT: vpsubw %ymm3, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm1, %zmm5, %zmm1
+; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2
-; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
-; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i16:
@@ -114,94 +112,128 @@
define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: var_rotate_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
-; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7
-; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
-; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
-; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpsllw $4, %ymm5, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm6
+; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm8
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm6
+; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
-; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
-; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %ymm1, %ymm3, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpsubb %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512VL-NEXT: vpsllw $4, %ymm5, %ymm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm6
+; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm8
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm8, %ymm6, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
-; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm6
+; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm9, %ymm7
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm7, %ymm6, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm4
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v64i8:
@@ -315,38 +347,40 @@
define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm4
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm4
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm5
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i16:
@@ -381,58 +415,66 @@
define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm4
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsllw %xmm2, %xmm6, %xmm7
+; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm6, %xmm1
+; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw %xmm4, %xmm6, %xmm3
+; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm4
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
-; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
-; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm5
+; AVX512VL-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: vpsllw %xmm2, %xmm6, %xmm7
+; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm6, %xmm1
+; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm4, %xmm6, %xmm3
+; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v64i8:
@@ -511,26 +553,26 @@
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm4
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v32i16:
@@ -559,7 +601,7 @@
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
@@ -570,36 +612,36 @@
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512F-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v64i8:
@@ -608,7 +650,7 @@
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
@@ -619,36 +661,36 @@
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm1, %ymm11, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm10, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
-; AVX512VL-NEXT: vpmullw %ymm0, %ymm11, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v64i8:
@@ -739,26 +781,26 @@
define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i16:
@@ -784,31 +826,37 @@
; AVX512F-LABEL: splatconstant_rotate_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v64i8:
@@ -865,32 +913,30 @@
define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
-; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3
-; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
-; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
-; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3
-; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
@@ -922,37 +968,41 @@
; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
-; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm1
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
-; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm3, %ymm0
-; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm1
+; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -181,10 +181,10 @@
;
; AVX512F-LABEL: sext_32i8_to_32i16:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sext_32i8_to_32i16:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -27,17 +27,17 @@
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512DQ-NEXT: vpsravd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm3
+; AVX512DQ-NEXT: vpsravd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
@@ -51,32 +51,32 @@
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31]
; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm6
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT: vpsraw $2, %ymm5, %ymm6
-; AVX512DQ-NEXT: vpaddw %ymm4, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpsraw $1, %ymm5, %ymm6
-; AVX512DQ-NEXT: vpaddw %ymm4, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
-; AVX512DQ-NEXT: vpsraw $4, %ymm2, %ymm5
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsraw $2, %ymm2, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsraw $1, %ymm2, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpsraw $1, %ymm5, %ymm6
; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
+; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
+; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpsraw $2, %ymm4, %ymm5
AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpsraw $1, %ymm4, %ymm5 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm2 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -169,8 +169,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsraw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -189,8 +189,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 @@ -252,15 +252,15 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: @@ -341,10 +341,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpsraw $3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -27,17 +27,17 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x 
i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512DQ-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: @@ -51,25 +51,25 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpsrlw $2, %ymm3, %ymm4 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpsrlw $4, %ymm2, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5 +; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw $2, %ymm2, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, 
%ymm3 -; AVX512DQ-NEXT: vpsrlw $1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw $1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512DQ-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm3 @@ -133,8 +133,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -153,8 +153,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 @@ -291,10 +291,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -27,17 +27,17 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512DQ-NEXT: vpsllvd %zmm3, %zmm2, 
%zmm2 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: @@ -51,23 +51,23 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpsllw $4, %ymm3, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpsllw $4, %ymm2, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5 +; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw $2, %ymm2, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm4 -; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512DQ-NEXT: 
vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm3 @@ -128,8 +128,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -148,8 +148,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpsllw %xmm1, %xmm3, %xmm3 @@ -289,10 +289,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -42,9 +42,9 @@ ; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] ; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u> ; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7],ymm3[8,9,10,11,12,13,14],ymm0[15] ; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] @@ -130,10 +130,10 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: 
shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: @@ -147,10 +147,10 @@ define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: @@ -164,12 +164,12 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] ; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] ; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: @@ -221,9 +221,9 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpsrad $25, %zmm0, %zmm0 ; KNL-NEXT: vpsrad $25, %zmm1, %zmm1 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; KNL-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; KNL-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 ; KNL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL-NEXT: retq @@ -247,9 +247,9 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vpsrld $25, %zmm0, %zmm0 ; KNL-NEXT: vpsrld $25, %zmm1, %zmm1 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; KNL-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; KNL-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 ; KNL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL-NEXT: retq @@ -435,10 +435,10 @@ define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: ; KNL: ## %bb.0: -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; KNL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: @@ -452,10 +452,10 @@ define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: @@ -500,10 +500,10 @@ define <32 x i16> @shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15],zero,zero,ymm1[10,11],zero,zero,ymm1[6,7],zero,zero,ymm1[2,3],zero,zero,ymm1[30,31],zero,zero,ymm1[26,27],zero,zero,ymm1[22,23],zero,zero,ymm1[20,21],zero,zero -; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero -; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[20,21],zero,zero +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -16,10 +16,10 @@ define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> 
%b) { ; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: @@ -29,10 +29,10 @@ ; ; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: @@ -46,10 +46,10 @@ define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: 
shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: @@ -59,10 +59,10 @@ ; ; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: @@ -163,11 +163,11 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] ; AVX512F-NEXT: retq ; @@ -179,11 +179,11 @@ ; ; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = 
zmm0[2,3,0,1,6,7,4,5] ; AVX512DQ-NEXT: retq ; @@ -366,10 +366,10 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: @@ -379,10 +379,10 @@ ; ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: @@ -396,10 +396,10 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: ; AVX512F: # 
%bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: @@ -409,10 +409,10 @@ ; ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: @@ -426,10 +426,10 @@ define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: @@ -439,10 +439,10 @@ ; ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: @@ -456,13 +456,13 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] -; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, 
%zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: @@ -473,13 +473,13 @@ ; ; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: @@ -495,17 +495,17 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] +; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: @@ -524,17 +524,17 @@ ; ; AVX512DQ-LABEL: 
shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: @@ -551,9 +551,9 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -569,9 +569,9 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -595,13 +595,13 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpacksswb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpacksswb %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: 
shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: @@ -616,13 +616,13 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpacksswb %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpacksswb %ymm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: @@ -645,9 +645,9 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -663,9 +663,9 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -689,13 +689,13 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpackuswb %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124: @@ -710,13 +710,13 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: 
vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
@@ -737,11 +737,11 @@
 define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
 ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
@@ -760,11 +760,11 @@
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
@@ -788,15 +788,15 @@
 define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
 ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
@@ -808,15 +808,15 @@
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm3
+; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -267,50 +267,44 @@
 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm6
-; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm4
+; AVX512VL-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512VL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512VL-NEXT: vpermi2d %zmm0, %zmm5, %zmm6
-; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0
 ; AVX512VL-NEXT: retq
 ;
 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
@@ -389,38 +383,32 @@
 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm5
 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
-; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1
 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm2, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm5
 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
-; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
+; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1
 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm2, %zmm0
 ; AVX512VL-NEXT: retq
 ;
 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
--- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -260,33 +260,33 @@
 define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512CD-LABEL: testv32i16:
 ; AVX512CD: # %bb.0:
-; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2
 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
+; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4
 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm4
-; AVX512CD-NEXT: vpaddb %ymm1, %ymm4, %ymm1
-; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm4, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsllw $8, %ymm2, %ymm4
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512CD-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
 ; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm2
-; AVX512CD-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512CD-NEXT: retq
 ;
 ; AVX512CDBW-LABEL: testv32i16:
@@ -327,19 +327,19 @@
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i16:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm2, %zmm2
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: testv32i16:
@@ -356,33 +356,33 @@
 define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512CD-LABEL: testv32i16u:
 ; AVX512CD: # %bb.0:
-; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2
 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
+; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4
 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm4
-; AVX512CD-NEXT: vpaddb %ymm1, %ymm4, %ymm1
-; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm4, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsllw $8, %ymm2, %ymm4
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512CD-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
 ; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm2
-; AVX512CD-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512CD-NEXT: retq
 ;
 ; AVX512CDBW-LABEL: testv32i16u:
@@ -423,19 +423,19 @@
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i16u:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm2, %zmm2
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; BITALG-LABEL: testv32i16u:
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -144,10 +144,10 @@
 ;
 ; AVX512F-LABEL: zext_32i8_to_32i16:
 ; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: zext_32i8_to_32i16:
diff --git a/llvm/test/CodeGen/X86/viabs.ll b/llvm/test/CodeGen/X86/viabs.ll
--- a/llvm/test/CodeGen/X86/viabs.ll
+++ b/llvm/test/CodeGen/X86/viabs.ll
@@ -929,10 +929,10 @@
 ;
 ; AVX512F-LABEL: test_abs_lt_v64i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc1,0x01]
-; AVX512F-NEXT: vpabsb %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc9]
+; AVX512F-NEXT: vpabsb %ymm0, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc8]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc0,0x01]
 ; AVX512F-NEXT: vpabsb %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x3a,0xc0,0x01]
 ; AVX512F-NEXT: retq # encoding: [0xc3]
 ;
 ; AVX512BW-LABEL: test_abs_lt_v64i8:
@@ -1002,10 +1002,10 @@
 ;
 ; AVX512F-LABEL: test_abs_gt_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc1,0x01]
-; AVX512F-NEXT: vpabsw %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc9]
+; AVX512F-NEXT: vpabsw %ymm0, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc8]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc0,0x01]
 ; AVX512F-NEXT: vpabsw %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x3a,0xc0,0x01]
 ; AVX512F-NEXT: retq # encoding: [0xc3]
 ;
 ; AVX512BW-LABEL: test_abs_gt_v32i16: