Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18512,7 +18512,23 @@ return DAG.getBitcast(NVT, NewExtract); } } - // TODO - handle (DestNumElts % SrcNumElts) == 0 + if ((DestNumElts % SrcNumElts) == 0) { + unsigned DestSrcRatio = DestNumElts / SrcNumElts; + if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) { + unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio; + EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), + SrcVT.getScalarType(), NewExtNumElts); + if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { + unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + SDLoc DL(N); + SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, + V.getOperand(0), NewIndex); + return DAG.getBitcast(NVT, NewExtract); + } + } + } } // Combine: Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -618,7 +618,6 @@ setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); if (Subtarget->supportsAddressTopByteIgnored()) @@ -10188,74 +10187,6 @@ return SDValue(); } -static SDValue performBitcastCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. 
- if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Remove extraneous bitcasts around an extract_subvector. - // For example, - // (v4i16 (bitconvert - // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) - // becomes - // (extract_subvector ((v8i16 ...), (i64 4))) - - // Only interested in 64-bit vectors as the ultimate result. - EVT VT = N->getValueType(0); - if (!VT.isVector() || VT.isScalableVector()) - return SDValue(); - if (VT.getSimpleVT().getSizeInBits() != 64) - return SDValue(); - // Is the operand an extract_subvector starting at the beginning or halfway - // point of the vector? A low half may also come through as an - // EXTRACT_SUBREG, so look for that, too. - SDValue Op0 = N->getOperand(0); - if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && - !(Op0->isMachineOpcode() && - Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) - return SDValue(); - uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue(); - if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { - if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) - return SDValue(); - } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { - if (idx != AArch64::dsub) - return SDValue(); - // The dsub reference is equivalent to a lane zero subvector reference. - idx = 0; - } - // Look through the bitcast of the input to the extract. - if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue Source = Op0->getOperand(0)->getOperand(0); - // If the source type has twice the number of elements as our destination - // type, we know this is an extract of the high or low half of the vector. 
- EVT SVT = Source->getValueType(0); - if (!SVT.isVector() || - SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) - return SDValue(); - - LLVM_DEBUG( - dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); - - // Create the simplified form to just extract the low or high half of the - // vector directly rather than bothering with the bitcasts. - SDLoc dl(N); - unsigned NumElements = VT.getVectorNumElements(); - if (idx) { - SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); - } else { - SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); - return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, - Source, SubReg), - 0); - } -} - static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -12456,8 +12387,6 @@ return performExtendCombine(N, DCI, DAG); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI, DAG); - case ISD::BITCAST: - return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45103,7 +45103,6 @@ SDValue InVec = N->getOperand(0); SDValue InVecBC = peekThroughBitcasts(InVec); EVT InVecVT = InVec.getValueType(); - EVT InVecBCVT = InVecBC.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && @@ -45147,31 +45146,6 @@ VT, SDLoc(N), InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); - // Try to move vector bitcast after extract_subv by scaling extraction index: - // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') - // TODO: Move this to 
DAGCombiner::visitEXTRACT_SUBVECTOR - if (InVec != InVecBC && InVecBCVT.isVector()) { - unsigned SrcNumElts = InVecBCVT.getVectorNumElements(); - unsigned DestNumElts = InVecVT.getVectorNumElements(); - if ((DestNumElts % SrcNumElts) == 0) { - unsigned DestSrcRatio = DestNumElts / SrcNumElts; - if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { - unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; - EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - InVecBCVT.getScalarType(), NewExtNumElts); - if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && - TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { - unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; - SDLoc DL(N); - SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); - SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, - InVecBC, NewIndex); - return DAG.getBitcast(VT, NewExtract); - } - } - } - } - // If we are extracting from an insert into a zero vector, replace with a // smaller insert into zero if we don't access less than the original // subvector. Don't do this for i1 vectors. Index: llvm/test/CodeGen/AArch64/merge-store.ll =================================================================== --- llvm/test/CodeGen/AArch64/merge-store.ll +++ llvm/test/CodeGen/AArch64/merge-store.ll @@ -42,17 +42,10 @@ ; the fastness of unaligned accesses was not specified correctly. 
define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) { -; SPLITTING-LABEL: merge_vec_extract_stores: -; SPLITTING: // %bb.0: -; SPLITTING-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; SPLITTING-NEXT: str d0, [x0, #24] -; SPLITTING-NEXT: str d1, [x0, #32] -; SPLITTING-NEXT: ret -; -; MISALIGNED-LABEL: merge_vec_extract_stores: -; MISALIGNED: // %bb.0: -; MISALIGNED-NEXT: stur q0, [x0, #24] -; MISALIGNED-NEXT: ret +; CHECK-LABEL: merge_vec_extract_stores: +; CHECK: // %bb.0: +; CHECK-NEXT: stur q0, [x0, #24] +; CHECK-NEXT: ret %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3 %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4 @@ -62,9 +55,4 @@ store <2 x float> %shuffle0, <2 x float>* %idx0, align 8 store <2 x float> %shuffle1, <2 x float>* %idx1, align 8 ret void - - -; FIXME: Ideally we would like to use a generic target for this test, but this relies -; on suppressing store pairs. - } Index: llvm/test/CodeGen/ARM/combine-vmovdrr.ll =================================================================== --- llvm/test/CodeGen/ARM/combine-vmovdrr.ll +++ llvm/test/CodeGen/ARM/combine-vmovdrr.ll @@ -9,8 +9,8 @@ ; they are defined on VPRs and used on VPRs. 
; ; CHECK-LABEL: motivatingExample: -; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1] -; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1] ; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]] ; CHECK-NEXT: vstr [[RES]], [r1] ; CHECK-NEXT: bx lr Index: llvm/test/CodeGen/X86/avg-mask.ll =================================================================== --- llvm/test/CodeGen/X86/avg-mask.ll +++ llvm/test/CodeGen/X86/avg-mask.ll @@ -130,9 +130,9 @@ ; AVX512F-NEXT: shrq $32, %rdi ; AVX512F-NEXT: shrq $48, %rax ; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vpavgb %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 @@ -142,14 +142,14 @@ ; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} ; AVX512F-NEXT: vpmovdb %zmm5, %xmm5 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} -; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, 
%ymm4, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_mask: @@ -178,9 +178,9 @@ ; AVX512F-NEXT: shrq $32, %rdi ; AVX512F-NEXT: shrq $48, %rax ; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 @@ -190,14 +190,14 @@ ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_maskz: @@ -330,18 +330,18 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vpavgw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgw %ymm1, 
%ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_mask: @@ -366,18 +366,18 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_maskz: Index: llvm/test/CodeGen/X86/madd.ll =================================================================== --- llvm/test/CodeGen/X86/madd.ll +++ llvm/test/CodeGen/X86/madd.ll @@ -1975,9 +1975,9 @@ ; ; 
AVX512F-LABEL: pmaddwd_32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -2188,9 +2188,9 @@ ; ; AVX512F-LABEL: jumbled_indices16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq Index: llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll =================================================================== --- llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -6374,9 +6374,9 @@ ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vpmovmskb %ymm1, %eax Index: llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -725,33 +725,33 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) { ; AVX512F-LABEL: 
trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> 
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ;