Index: CodeGen/X86/avx512-cvt.ll =================================================================== --- CodeGen/X86/avx512-cvt.ll +++ CodeGen/X86/avx512-cvt.ll @@ -917,11 +917,9 @@ ; AVX512DQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2 ; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0 ; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1 -; AVX512DQ-NEXT: vpmovm2q %k1, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 ; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <16 x double> %a, zeroinitializer @@ -960,8 +958,7 @@ ; AVX512DQ: ## BB#0: ; AVX512DQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1 ; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <8 x double> %a, zeroinitializer @@ -1002,8 +999,7 @@ ; AVX512DQ-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <8 x float> %a, zeroinitializer Index: CodeGen/X86/vector-compare-results.ll =================================================================== --- CodeGen/X86/vector-compare-results.ll +++ CodeGen/X86/vector-compare-results.ll @@ -621,9 +621,9 @@ ; ; AVX512BW-LABEL: test_cmp_v8f64: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; 
AVX512BW-NEXT: retq %1 = fcmp ogt <8 x double> %a0, %a1 ret <8 x i1> %1 @@ -681,9 +681,9 @@ ; ; AVX512BW-LABEL: test_cmp_v16f32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x float> %a0, %a1 ret <16 x i1> %1 @@ -792,9 +792,9 @@ ; ; AVX512BW-LABEL: test_cmp_v8i64: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: retq %1 = icmp sgt <8 x i64> %a0, %a1 ret <8 x i1> %1 @@ -855,9 +855,9 @@ ; ; AVX512BW-LABEL: test_cmp_v16i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i32> %a0, %a1 ret <16 x i1> %1 @@ -1137,10 +1137,9 @@ ; ; AVX512BW-LABEL: test_cmp_v32i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: retq %1 = icmp sgt <32 x i16> %a0, %a1 ret <32 x i1> %1 Index: CodeGen/X86/vector-sext.ll =================================================================== --- CodeGen/X86/vector-sext.ll +++ CodeGen/X86/vector-sext.ll @@ -1946,9 
+1946,9 @@ ; AVX512BW-LABEL: load_sext_8i1_to_8i16: ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_8i1_to_8i16: @@ -2847,12 +2847,19 @@ ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_16i1_to_16i8: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_16i1_to_16i8: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_16i1_to_16i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_16i1_to_16i8: ; X32-SSE41: # BB#0: # %entry @@ -3384,12 +3391,19 @@ ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_16i1_to_16i16: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_16i1_to_16i16: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_16i1_to_16i16: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; 
AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_16i1_to_16i16: ; X32-SSE41: # BB#0: # %entry @@ -4228,16 +4242,23 @@ ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_32i1_to_32i8: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: kmovw 2(%rdi), %k2 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_32i1_to_32i8: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: kmovw 2(%rdi), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_32i1_to_32i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_32i1_to_32i8: ; X32-SSE41: # BB#0: # %entry @@ -4968,10 +4989,9 @@ ; ; AVX512BW-LABEL: sext_32xi1_to_32xi8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: sext_32xi1_to_32xi8: Index: Target/X86/X86ISelLowering.cpp =================================================================== --- Target/X86/X86ISelLowering.cpp +++ Target/X86/X86ISelLowering.cpp @@ -17460,17 +17460,10 @@ // SKX processor if ((InVTElt == MVT::i1) && 
- (((Subtarget.hasBWI() && Subtarget.hasVLX() && - VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || + (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) || - ((Subtarget.hasBWI() && VT.is512BitVector() && - VTElt.getSizeInBits() <= 16)) || + ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32)))) - ((Subtarget.hasDQI() && Subtarget.hasVLX() && - VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || - - ((Subtarget.hasDQI() && VT.is512BitVector() && - VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); unsigned NumElts = VT.getVectorNumElements(); @@ -17712,6 +17705,8 @@ unsigned NumElts = VT.getVectorNumElements(); if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || + (Subtarget.hasBWI() && NumElts >= 32) || + (Subtarget.hasDQI() && NumElts < 16) || NumElts == 16) { // Load and extend - everything is legal if (NumElts < 8) { @@ -17740,7 +17735,7 @@ if (NumElts <= 8) { // A subset, assume that we have only AVX-512F - unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts; + unsigned NumBitsToLoad = 8; MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad); SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(), Ld->getBasePtr(), Index: Target/X86/X86InstrAVX512.td =================================================================== --- Target/X86/X86InstrAVX512.td +++ Target/X86/X86InstrAVX512.td @@ -7884,6 +7884,17 @@ [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; } +// Use 512bit version to implement 128/256 bit in case NoVLX. 
+multiclass avx512_convert_mask_to_vector_lowering<X86VectorVTInfo X86Info, + X86VectorVTInfo _> { + + def : Pat<(X86Info.VT (X86vsext (X86Info.KVT X86Info.KRC:$src))), + (X86Info.VT (EXTRACT_SUBREG + (_.VT (!cast<Instruction>(NAME#"Zrr") + (_.KVT (COPY_TO_REGCLASS X86Info.KRC:$src,_.KRC)))), + X86Info.SubRegIdx))>; +} + multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, string OpcodeStr, Predicate prd> { let Predicates = [prd] in @@ -7893,20 +7904,17 @@ defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; } -} +let Predicates = [prd, NoVLX] in { + defm Z256_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info256, VTInfo.info512>; + defm Z128_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info128, VTInfo.info512>; + } -multiclass avx512_convert_mask_to_vector<string OpcodeStr> { - defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr, - HasBWI>; - defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, - HasBWI>, VEX_W; - defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr, - HasDQI>; - defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, - HasDQI>, VEX_W; } -defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; +defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>; +defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W; +defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>; +defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W; multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { def rr : AVX512XS8I