Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -17747,17 +17747,10 @@
   // SKX processor
   if ((InVTElt == MVT::i1) &&
-      (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
-        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
+      (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
-       ((Subtarget.hasBWI() && VT.is512BitVector() &&
-        VTElt.getSizeInBits() <= 16)) ||
+       ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
-       ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
-        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
-
-       ((Subtarget.hasDQI() && VT.is512BitVector() &&
-        VTElt.getSizeInBits() >= 32))))
     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

   unsigned NumElts = VT.getVectorNumElements();
@@ -18002,7 +17995,8 @@
   MVT VT = Op.getValueType().getSimpleVT();
   unsigned NumElts = VT.getVectorNumElements();

-  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+  if ((Subtarget.hasBWI() && NumElts >= 32) ||
+      (Subtarget.hasDQI() && NumElts < 16) ||
       NumElts == 16) {
     // Load and extend - everything is legal
     if (NumElts < 8) {
@@ -18031,7 +18025,7 @@

   if (NumElts <= 8) {
     // A subset, assume that we have only AVX-512F
-    unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+    unsigned NumBitsToLoad = 8;
     MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
     SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
                                Ld->getBasePtr(),
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -7953,6 +7953,17 @@
                   [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
 }

+// Use the 512-bit version to implement the 128/256-bit variants in the NoVLX case.
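+// The pattern below selects the 512-bit VPMOVM2* instruction for the narrow
+// types and then extracts the low 128/256-bit subregister of the result.
+// For example, for the D variant (illustrative types; assumes AVX512DQ
+// without VLX):
+//
+//   (v4i32 (vsext (v4i1 VK4:$k)))
+//     --> (EXTRACT_SUBREG (v16i32 (VPMOVM2DZrr (COPY_TO_REGCLASS $k, VK16))),
+//                         sub_xmm)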
+multiclass avx512_convert_mask_to_vector_lowering<X86VectorVTInfo X86Info,
+                                                  X86VectorVTInfo _> {
+
+  def : Pat<(X86Info.VT (X86vsext (X86Info.KVT X86Info.KRC:$src))),
+            (X86Info.VT (EXTRACT_SUBREG
+                         (_.VT (!cast<Instruction>(NAME#"Zrr")
+                          (_.KVT (COPY_TO_REGCLASS X86Info.KRC:$src,_.KRC)))),
+                       X86Info.SubRegIdx))>;
+}
+
 multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
                                  string OpcodeStr, Predicate prd> {
 let Predicates = [prd] in
@@ -7962,20 +7973,17 @@
     defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
     defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
   }
-}
+let Predicates = [prd, NoVLX] in {
+ defm Z256_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info256, VTInfo.info512>;
+ defm Z128_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info128, VTInfo.info512>;
+ }
-multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
-  defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr,
-                                       HasBWI>;
-  defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr,
-                                       HasBWI>, VEX_W;
-  defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr,
-                                       HasDQI>;
-  defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr,
-                                       HasDQI>, VEX_W;
 }
-defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
+defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>;
+defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W;
+defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>;
+defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W;

 multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
 def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
Index: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
@@ -1150,8 +1148,7 @@
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
-; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: retq
 %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
@@ -1192,8 +1189,7 @@
; AVX512DQ-NEXT: ## kill: %YMM0 %YMM0 %ZMM0
; AVX512DQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
-; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512DQ-NEXT: retq
 %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
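;
; The two avx512-cvt.ll hunks above illustrate the effect of the lowering
; change: for a sign-extension of an i1 mask to 32-bit elements on a target
; with AVX512DQ but no VLX, e.g. (illustrative IR, names are placeholders):
;
;   %s = sext <8 x i1> %m to <8 x i32>
;
; the old code selected vpmovm2q %k0, %zmm0 followed by a vpmovqd %zmm0, %ymm0
; truncation, while the new code selects vpmovm2d %k0, %zmm0 directly and uses
; the low ymm half of the result.
;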
Index: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,6 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
+

 define i16 @mask16(i16 %x) {
; CHECK-LABEL: mask16:
@@ -43,6 +46,20 @@
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: mask8:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovw %edi, %k0
+; AVX512BW-NEXT: knotw %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: mask8:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: kmovb %edi, %k0
+; AVX512DQ-NEXT: knotb %k0, %k0
+; AVX512DQ-NEXT: kmovb %k0, %eax
+; AVX512DQ-NEXT: retq
 %m0 = bitcast i8 %x to <8 x i1>
 %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
 %ret = bitcast <8 x i1> %m1 to i8
@@ -64,6 +81,21 @@
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: mask8_zext:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovw %edi, %k0
+; AVX512BW-NEXT: knotw %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: mask8_zext:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: kmovb %edi, %k0
+; AVX512DQ-NEXT: knotb %k0, %k0
+; AVX512DQ-NEXT: kmovb %k0, %eax
+; AVX512DQ-NEXT: retq
 %m0 = bitcast i8 %x to <8 x i1>
 %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
 %m2 = bitcast <8 x i1> %m1 to i8
@@ -102,6 +134,22 @@
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: mask8_mem:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: movzbl (%rdi), %eax
+; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: knotw %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: movb %al, (%rdi)
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: mask8_mem:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: kmovb (%rdi), %k0
+; AVX512DQ-NEXT: knotb %k0, %k0
+; AVX512DQ-NEXT: kmovb %k0, (%rdi)
+; AVX512DQ-NEXT: retq
 %x = load i8, i8* %ptr, align 4
 %m0 = bitcast i8 %x to <8 x i1>
 %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -161,6 +209,20 @@
; SKX-NEXT: kshiftrw $8, %k0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: shuf_test1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovw %edi, %k0
+; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuf_test1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: kmovw %edi, %k0
+; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT: kmovb %k0, %eax
+; AVX512DQ-NEXT: retq
 %v1 = bitcast i16 %v to <16 x i1>
 %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %mask1 = bitcast <8 x i1> %mask to i8
@@ -186,6 +248,26 @@
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: zext_test1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $10, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: zext_test1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT: kmovw %k0, %eax
+; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %cmp_res = icmp ugt <16 x i32> %a, %b
 %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
 %res = zext i1 %cmp_res.i1 to i32
@@ -213,6 +295,28 @@
; SKX-NEXT: ## kill: %AX %AX %EAX
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: zext_test2:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $10, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: ## kill: %AX %AX %EAX
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: zext_test2:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT: kmovw %k0, %eax
+; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: ## kill: %AX %AX %EAX
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %cmp_res = icmp ugt <16 x i32> %a, %b
 %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
 %res = zext i1 %cmp_res.i1 to i16
@@ -240,6 +344,28 @@
; SKX-NEXT: ## kill: %AL %AL %EAX
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: zext_test3:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $10, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: ## kill: %AL %AL %EAX
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: zext_test3:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT: kmovw %k0, %eax
+; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: ## kill: %AL %AL %EAX
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %cmp_res = icmp ugt <16 x i32> %a, %b
 %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
 %res = zext i1 %cmp_res.i1 to i8
@@ -263,6 +389,23 @@
; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb $-2, %al
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: conv1:
+; AVX512BW: ## BB#0: ## %entry
+; AVX512BW-NEXT: kxnorw %k0, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: movb %al, (%rdi)
+; AVX512BW-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb $-2, %al
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: conv1:
+; AVX512DQ: ## BB#0: ## %entry
+; AVX512DQ-NEXT: kxnorw %k0, %k0, %k0
+; AVX512DQ-NEXT: kmovb %k0, (%rdi)
+; AVX512DQ-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
+; AVX512DQ-NEXT: movb $-2, %al
+; AVX512DQ-NEXT: retq
 entry:
 store <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %R

@@ -291,6 +434,26 @@
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: test4:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: test4:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %x_gt_y = icmp sgt <4 x i64> %x, %y
 %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
 %res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1
@@ -313,6 +476,20 @@
; SKX-NEXT: kandnw %k1, %k0, %k0
; SKX-NEXT: vpmovm2q %k0, %xmm0
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: test5:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: test5:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: retq
 %x_gt_y = icmp slt <2 x i64> %x, %y
 %x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1
 %res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1
@@ -353,6 +530,30 @@
; SKX-NEXT: korb %k1, %k0, %k0
; SKX-NEXT: ktestb %k0, %k0
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: test7:
+; AVX512BW: ## BB#0: ## %allocas
+; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT: movb $85, %al
+; AVX512BW-NEXT: kmovw %eax, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: testb %al, %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: test7:
+; AVX512DQ: ## BB#0: ## %allocas
+; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: movb $85, %al
+; AVX512DQ-NEXT: kmovb %eax, %k1
+; AVX512DQ-NEXT: korb %k1, %k0, %k0
+; AVX512DQ-NEXT: ktestb %k0, %k0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 allocas:
 %a= or <8 x i1> %mask, <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>
 %b = bitcast <8 x i1> %a to i8
@@ -396,6 +597,38 @@
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: test8:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: jg LBB17_1
+; AVX512BW-NEXT: ## BB#2:
+; AVX512BW-NEXT: vpcmpltud %zmm2, %zmm1, %k0
+; AVX512BW-NEXT: jmp LBB17_3
+; AVX512BW-NEXT: LBB17_1:
+; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: LBB17_3:
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: test8:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: jg LBB17_1
+; AVX512DQ-NEXT: ## BB#2:
+; AVX512DQ-NEXT: vpcmpltud %zmm2, %zmm1, %k0
+; AVX512DQ-NEXT: jmp LBB17_3
+; AVX512DQ-NEXT: LBB17_1:
+; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512DQ-NEXT: LBB17_3:
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %cond = icmp sgt i32 %a1, %b1
 %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
 %cmp2 = icmp ult <16 x i32> %b, zeroinitializer
@@ -433,6 +666,39 @@
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: test9:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: jg LBB18_1
+; AVX512BW-NEXT: ## BB#2:
+; AVX512BW-NEXT: vpsllw $7, %xmm1, %xmm0
+; AVX512BW-NEXT: jmp LBB18_3
+; AVX512BW-NEXT: LBB18_1:
+; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: LBB18_3:
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: test9:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: jg LBB18_1
+; AVX512DQ-NEXT: ## BB#2:
+; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0
+; AVX512DQ-NEXT: jmp LBB18_3
+; AVX512DQ-NEXT: LBB18_1:
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: LBB18_3:
+; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %mask = icmp sgt i32 %a1, %b1
 %c = select i1 %mask, <16 x i1>%a, <16 x i1>%b
 ret <16 x i1>%c
@@ -465,6 +731,24 @@
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: test11:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: jg LBB20_2
+; AVX512BW-NEXT: ## BB#1:
+; AVX512BW-NEXT: vmovaps %xmm1, %xmm0
+; AVX512BW-NEXT: LBB20_2:
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: test11:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: cmpl %esi, %edi
+;
AVX512DQ-NEXT: jg LBB20_2 +; AVX512DQ-NEXT: ## BB#1: +; AVX512DQ-NEXT: vmovaps %xmm1, %xmm0 +; AVX512DQ-NEXT: LBB20_2: +; AVX512DQ-NEXT: retq %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <4 x i1>%a, <4 x i1>%b ret <4 x i1>%c @@ -518,6 +802,30 @@ ; SKX-NEXT: kmovw %ecx, %k0 ; SKX-NEXT: vpmovm2b %k0, %xmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test15: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movw $21845, %ax ## imm = 0x5555 +; AVX512BW-NEXT: movw $1, %cx +; AVX512BW-NEXT: cmovgw %ax, %cx +; AVX512BW-NEXT: kmovw %ecx, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test15: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movw $21845, %ax ## imm = 0x5555 +; AVX512DQ-NEXT: movw $1, %cx +; AVX512DQ-NEXT: cmovgw %ax, %cx +; AVX512DQ-NEXT: kmovw %ecx, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %a = bitcast i16 21845 to <16 x i1> %b = bitcast i16 1 to <16 x i1> %mask = icmp sgt i32 %x, %y @@ -581,6 +889,60 @@ ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test16: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k0 +; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $15, %k1, %k1 +; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 +; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test16: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: Lcfi0: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: Lcfi1: +; AVX512DQ-NEXT: .cfi_offset %rbp, -16 +; AVX512DQ-NEXT: movq %rsp, %rbp +; AVX512DQ-NEXT: Lcfi2: +; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp +; AVX512DQ-NEXT: andq $-32, %rsp +; AVX512DQ-NEXT: subq $64, %rsp +; AVX512DQ-NEXT: movl %edi, (%rsp) +; AVX512DQ-NEXT: shrq $32, %rdi +; AVX512DQ-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AVX512DQ-NEXT: kmovw (%rsp), %k0 +; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: movl $1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 +; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: movq %rbp, %rsp +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq %a = bitcast i64 %x to <64 x i1> %b = insertelement <64 x i1>%a, i1 true, i32 5 %c = sext <64 x i1>%b to <64 x i8> @@ -647,6 +1009,64 @@ ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; 
SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test17: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k0 +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: setg %al +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: kmovw %eax, %k1 +; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 +; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test17: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: Lcfi3: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: Lcfi4: +; AVX512DQ-NEXT: .cfi_offset %rbp, -16 +; AVX512DQ-NEXT: movq %rsp, %rbp +; AVX512DQ-NEXT: Lcfi5: +; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp +; AVX512DQ-NEXT: andq $-32, %rsp +; AVX512DQ-NEXT: subq $64, %rsp +; AVX512DQ-NEXT: movl %edi, (%rsp) +; AVX512DQ-NEXT: shrq $32, %rdi +; AVX512DQ-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AVX512DQ-NEXT: kmovw (%rsp), %k0 +; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: setg %al +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 +; AVX512DQ-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: movq %rbp, %rsp +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: retq %a = bitcast i64 %x to <64 x i1> %b = icmp sgt i32 %y, %z %c = insertelement <64 x i1>%a, i1 %b, i32 5 @@ -697,6 +1117,51 @@ ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test18: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovw %edi, %k1 +; AVX512BW-NEXT: kmovw %esi, %k2 +; AVX512BW-NEXT: kshiftlw $7, %k2, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $15, %k2, %k2 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsllq $63, %zmm2, %zmm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test18: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovb %edi, %k0 +; AVX512DQ-NEXT: kmovw %esi, %k1 +; AVX512DQ-NEXT: kshiftlw $7, %k1, %k2 +; AVX512DQ-NEXT: 
kshiftrw $15, %k2, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: vpmovm2q %k1, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpmovq2m %zmm2, %k0 +; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 +; AVX512DQ-NEXT: korb %k1, %k0, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %b1 = bitcast i16 %y to <16 x i1> %el1 = extractelement <16 x i1>%b1, i32 8 @@ -725,6 +1190,26 @@ ; SKX-NEXT: vpmovb2m %ymm1, %k1 ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test21: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test21: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512DQ-NEXT: vpsllw $15, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $15, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512DQ-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: retq %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } @@ -745,6 +1230,25 @@ ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test22: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test22: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovb %k0, (%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq store <4 x i1> %a, <4 x i1>* %addr ret void } @@ -765,6 +1269,25 @@ ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test23: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test23: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovb %k0, (%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq store <2 x i1> %a, <2 x i1>* %addr ret void } @@ -790,6 +1313,27 @@ ; SKX-NEXT: kxorw %k1, %k0, 
%k0
; SKX-NEXT: kmovb %k0, (%rsi)
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: store_v1i1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: andl $1, %edi
+; AVX512BW-NEXT: kmovw %edi, %k0
+; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $15, %k1, %k1
+; AVX512BW-NEXT: kxorw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: movb %al, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: store_v1i1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: andl $1, %edi
+; AVX512DQ-NEXT: kmovw %edi, %k0
+; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT: kmovb %k0, (%rsi)
+; AVX512DQ-NEXT: retq
 %x = xor <1 x i1> %c, <i1 1>
 store <1 x i1> %x, <1 x i1>* %ptr, align 4
 ret void
@@ -812,6 +1356,25 @@
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: store_v2i1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: movb %al, (%rdi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: store_v2i1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: kmovb %k0, (%rdi)
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %x = xor <2 x i1> %c, <i1 1, i1 1>
 store <2 x i1> %x, <2 x i1>* %ptr, align 4
 ret void
@@ -835,6 +1398,27 @@
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: store_v4i1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: movb %al, (%rdi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: store_v4i1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: kmovb %k0, (%rdi)
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
 store <4 x i1> %x, <4 x i1>* %ptr, align 4
 ret void
@@ -858,6 +1442,26 @@
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: store_v8i1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT: knotw %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: movb %al, (%rdi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: store_v8i1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: knotb %k0, %k0
+; AVX512DQ-NEXT: kmovb %k0, (%rdi)
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %x = xor <8 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
 store <8 x i1> %x, <8 x i1>* %ptr, align 4
 ret void
@@ -880,6 +1484,25 @@
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovw %k0, (%rdi)
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: store_v16i1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: knotw %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, (%rdi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: store_v16i1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT: knotw %k0, %k0
+; AVX512DQ-NEXT: kmovw %k0, (%rdi)
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
 %x = xor <16 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
 store <16 x i1> %x, <16 x i1>* %ptr, align 4
 ret void
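;
; Note on the stores above: kmovb, knotb, korb and ktestb are AVX512DQ
; instructions, while kmovw/knotw are base AVX512F. That is why the AVX512DQ
; blocks can write an 8-bit mask with a single kmovb, whereas the AVX512BW
; runs round-trip through kmovw %k0, %eax followed by a byte-sized movb store.
;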
@@ -925,6 +1548,33 @@
; SKX-NEXT: kmovb %k0, {{.*}}(%rip)
; SKX-NEXT: xorl $1, %edi
; SKX-NEXT: jmp _f2 ## TAILCALL
+;
+; AVX512BW-LABEL: f1:
+; AVX512BW: ## BB#0: ## %entry
+; AVX512BW-NEXT: movzbl {{.*}}(%rip), %edi
+; AVX512BW-NEXT: movl %edi, %eax
+; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: kmovw %eax, %k0
+; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $15, %k1, %k1
+; AVX512BW-NEXT: kxorw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: movb %al, {{.*}}(%rip)
+; AVX512BW-NEXT: xorl $1, %edi
+; AVX512BW-NEXT: jmp _f2 ## TAILCALL
+;
+; AVX512DQ-LABEL: f1:
+; AVX512DQ: ## BB#0: ## %entry
+; AVX512DQ-NEXT: movzbl {{.*}}(%rip), %edi
+; AVX512DQ-NEXT: movl %edi, %eax
+; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: kmovw %eax, %k0
+; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT: kmovb %k0, {{.*}}(%rip)
+; AVX512DQ-NEXT: xorl $1, %edi
+; AVX512DQ-NEXT: jmp _f2 ## TAILCALL
 entry:
 %.b1 = load i1, i1* @f1.v, align 4
 %not..b1 = xor i1 %.b1, true
@@ -971,6 +1621,19 @@
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: test_build_vec_v32i1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: test_build_vec_v32i1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: retq
 %ret = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %x, <32 x i16> zeroinitializer
 ret <32 x i16> %ret
 }
@@ -988,6 +1651,19 @@
; SKX-NEXT: kmovq %rax, %k1
; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: test_build_vec_v64i1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: test_build_vec_v64i1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: retq
 %ret = select <64 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <64 x i8> %x, <64 x i8> zeroinitializer
 ret <64 x i8> %ret
 }
@@ -1025,6 +1701,41 @@
; SKX-NEXT: vmovapd %zmm0, 8(%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+;
+; AVX512BW-LABEL: ktest_1:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovupd (%rdi), %zmm1
+; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovw %k0, %eax
+; AVX512BW-NEXT: testb %al, %al
+; AVX512BW-NEXT: je LBB41_2
+; AVX512BW-NEXT: ## BB#1: ## %L1
+; AVX512BW-NEXT: vmovapd %zmm0, (%rdi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+; AVX512BW-NEXT: LBB41_2: ## %L2
+; AVX512BW-NEXT: vmovapd %zmm0, 8(%rdi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: ktest_1:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vmovupd (%rdi), %zmm1
+; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512DQ-NEXT: vmovupd 8(%rdi),
%zmm1 {%k1} {z} +; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} +; AVX512DQ-NEXT: ktestb %k0, %k0 +; AVX512DQ-NEXT: je LBB41_2 +; AVX512DQ-NEXT: ## BB#1: ## %L1 +; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; AVX512DQ-NEXT: LBB41_2: ## %L2 +; AVX512DQ-NEXT: vmovapd %zmm0, 8(%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %addr1 = getelementptr double, double * %base, i64 0 %addr2 = getelementptr double, double * %base, i64 1 @@ -1377,6 +2088,331 @@ ; SKX-NEXT: vmovaps %zmm1, 68(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq +; +; AVX512BW-LABEL: ktest_2: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vmovups (%rdi), %zmm2 +; AVX512BW-NEXT: vmovups 64(%rdi), %zmm3 +; AVX512BW-NEXT: vcmpltps %zmm0, %zmm2, %k1 +; AVX512BW-NEXT: vcmpltps %zmm1, %zmm3, %k2 +; AVX512BW-NEXT: kunpckwd %k1, %k2, %k0 +; AVX512BW-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vcmpltps %zmm3, %zmm0, %k1 +; AVX512BW-NEXT: vcmpltps %zmm2, %zmm1, %k2 +; AVX512BW-NEXT: kunpckwd %k1, %k2, %k1 +; AVX512BW-NEXT: kord %k1, %k0, %k0 +; AVX512BW-NEXT: ktestd %k0, %k0 +; AVX512BW-NEXT: je LBB42_2 +; AVX512BW-NEXT: ## BB#1: ## %L1 +; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; AVX512BW-NEXT: LBB42_2: ## %L2 +; AVX512BW-NEXT: vmovaps %zmm0, 4(%rdi) +; AVX512BW-NEXT: vmovaps %zmm1, 68(%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: ktest_2: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: Lcfi6: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: Lcfi7: +; AVX512DQ-NEXT: .cfi_offset %rbp, -16 +; AVX512DQ-NEXT: movq %rsp, %rbp +; AVX512DQ-NEXT: Lcfi8: +; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp +; AVX512DQ-NEXT: andq $-32, %rsp +; AVX512DQ-NEXT: subq $32, %rsp +; AVX512DQ-NEXT: vmovups (%rdi), %zmm2 +; AVX512DQ-NEXT: vmovups 64(%rdi), %zmm3 +; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm3 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $13, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $12, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $10, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $9, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $8, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $7, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $6, %k1, %k0 +; 
AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $5, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $4, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $3, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $2, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm2 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $14, 
%eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftrw $15, %k2, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z} +; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm4 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; AVX512DQ-NEXT: vcmpltps %zmm3, %zmm0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm3 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; 
AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rsp) +; AVX512DQ-NEXT: cmpl $0, (%rsp) +; AVX512DQ-NEXT: je LBB42_2 +; AVX512DQ-NEXT: ## BB#1: ## %L1 +; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi) +; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi) +; AVX512DQ-NEXT: jmp LBB42_3 +; AVX512DQ-NEXT: LBB42_2: ## %L2 +; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi) +; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi) +; AVX512DQ-NEXT: LBB42_3: ## %End +; AVX512DQ-NEXT: movq %rbp, %rsp +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %addr1 = getelementptr float, float * %base, i64 0 %addr2 = getelementptr float, float * %base, i64 1 @@ -1417,6 +2453,19 @@ ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: vpmovm2q %k0, %zmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: load_8i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: load_8i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: retq %b = load 
<8 x i1>, <8 x i1>* %a %c = sext <8 x i1> %b to <8 x i64> ret <8 x i64> %c @@ -1434,6 +2483,18 @@ ; SKX-NEXT: kmovw (%rdi), %k0 ; SKX-NEXT: vpmovm2d %k0, %zmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: load_16i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: load_16i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: retq %b = load <16 x i1>, <16 x i1>* %a %c = sext <16 x i1> %b to <16 x i32> ret <16 x i32> %c @@ -1453,6 +2514,23 @@ ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: vpmovm2q %k0, %xmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: load_2i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: load_2i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %b = load <2 x i1>, <2 x i1>* %a %c = sext <2 x i1> %b to <2 x i16> ret <2 x i16> %c @@ -1473,6 +2551,24 @@ ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: load_4i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: load_4i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovb (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %b = load <4 x i1>, <4 x i1>* %a %c = sext <4 x i1> %b to <4 x i16> ret <4 x i16> %c @@ -1494,6 +2590,22 @@ ; SKX-NEXT: kmovd (%rdi), %k0 ; SKX-NEXT: vpmovm2w %k0, %zmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: load_32i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: load_32i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 +; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512DQ-NEXT: retq %b = load <32 x i1>, <32 x i1>* %a %c = sext <32 x i1> %b to <32 x i16> ret <32 x i16> %c @@ -1523,6 +2635,30 @@ ; SKX-NEXT: kmovq (%rdi), %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq +; +; AVX512BW-LABEL: load_64i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: load_64i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k2 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vpmovm2d %k3, %zmm2 +; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, 
%ymm1 +; AVX512DQ-NEXT: retq %b = load <64 x i1>, <64 x i1>* %a %c = sext <64 x i1> %b to <64 x i8> ret <64 x i8> %c @@ -1544,6 +2680,24 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) ; SKX-NEXT: retq +; +; AVX512BW-LABEL: store_8i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: store_8i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovb %k0, (%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq store <8 x i1> %v, <8 x i1>* %a ret void } @@ -1564,6 +2718,24 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) ; SKX-NEXT: retq +; +; AVX512BW-LABEL: store_8i1_1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: movb %al, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: store_8i1_1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovb %k0, (%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %v1 = trunc <8 x i16> %v to <8 x i1> store <8 x i1> %v1, <8 x i1>* %a ret void @@ -1584,6 +2756,23 @@ ; SKX-NEXT: vpmovb2m %xmm0, %k0 ; SKX-NEXT: kmovw %k0, (%rdi) ; SKX-NEXT: retq +; +; AVX512BW-LABEL: store_16i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: store_16i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq store <16 x i1> %v, <16 x i1>* %a ret void } @@ -1609,6 +2798,28 @@ ; SKX-NEXT: kmovd %k0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq +; +; AVX512BW-LABEL: store_32i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: store_32i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq store <32 x i1> %v, <32 x i1>* %a ret void } @@ -1637,6 +2848,31 @@ ; SKX-NEXT: kmovd %k0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq +; +; AVX512BW-LABEL: store_32i1_1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: store_32i1_1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: 
vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rdi) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %v1 = trunc <32 x i16> %v to <32 x i1> store <32 x i1> %v1, <32 x i1>* %a ret void @@ -1972,6 +3208,335 @@ ; SKX-NEXT: kmovq %k0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq +; +; AVX512BW-LABEL: store_64i1: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: store_64i1: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: Lcfi9: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: pushq %r15 +; AVX512DQ-NEXT: Lcfi10: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 24 +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: Lcfi11: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 32 +; AVX512DQ-NEXT: pushq %r13 +; AVX512DQ-NEXT: Lcfi12: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 40 +; AVX512DQ-NEXT: pushq %r12 +; AVX512DQ-NEXT: Lcfi13: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 48 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: Lcfi14: +; AVX512DQ-NEXT: .cfi_def_cfa_offset 56 +; AVX512DQ-NEXT: Lcfi15: +; AVX512DQ-NEXT: .cfi_offset %rbx, -56 +; AVX512DQ-NEXT: Lcfi16: +; AVX512DQ-NEXT: .cfi_offset %r12, -48 +; AVX512DQ-NEXT: Lcfi17: +; AVX512DQ-NEXT: .cfi_offset %r13, -40 +; AVX512DQ-NEXT: Lcfi18: +; AVX512DQ-NEXT: .cfi_offset %r14, -32 +; AVX512DQ-NEXT: Lcfi19: +; AVX512DQ-NEXT: .cfi_offset %r15, -24 +; AVX512DQ-NEXT: Lcfi20: +; AVX512DQ-NEXT: .cfi_offset %rbp, -16 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r8d +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r10d +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r11d +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ebx +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: 
kmovw %k1, %ecx +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %esi +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: vmovd %r9d, %xmm3 +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k2 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, 6(%rdi) +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r8d +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r10d +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r9d +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r11d +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r14d +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r15d +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r12d +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r13d +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %esi +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %ebp +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %ebx +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %edx +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: vmovd %r10d, %xmm2 +; AVX512DQ-NEXT: kmovw %k0, %r10d +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k2, %k0 +; AVX512DQ-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1 +; AVX512DQ-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1 +; 
AVX512DQ-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 4(%rdi) +; AVX512DQ-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r8d +; AVX512DQ-NEXT: kshiftlw $15, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r10d +; AVX512DQ-NEXT: kshiftlw $13, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r9d +; AVX512DQ-NEXT: kshiftlw $12, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r11d +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r14d +; AVX512DQ-NEXT: kshiftlw $10, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r15d +; AVX512DQ-NEXT: kshiftlw $9, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r12d +; AVX512DQ-NEXT: kshiftlw $8, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %r13d +; AVX512DQ-NEXT: kshiftlw $7, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %ecx +; AVX512DQ-NEXT: kshiftlw $6, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %esi +; AVX512DQ-NEXT: kshiftlw $5, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %ebp +; AVX512DQ-NEXT: kshiftlw $4, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %ebx +; AVX512DQ-NEXT: kshiftlw $3, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: kshiftlw $2, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %edx +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: vmovd %r10d, %xmm1 +; AVX512DQ-NEXT: kmovw %k0, %r10d +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0 +; AVX512DQ-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512DQ-NEXT: kmovw %k1, 2(%rdi) +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; 
AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r8d +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r10d +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r11d +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r14d +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r15d +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r12d +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %r13d +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %edx +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %esi +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ebp +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ebx +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: vmovd %r9d, %xmm0 +; AVX512DQ-NEXT: kmovw %k1, %r9d +; AVX512DQ-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rdi) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r12 +; AVX512DQ-NEXT: popq %r13 +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %r15 +; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq store <64 x i1> %v, <64 x i1>* %a ret void } @@ -1994,6 +3559,25 @@ ; SKX-NEXT: addl %eax, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_bitcast_v8i1_zext: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: movzbl %al, %eax +; AVX512BW-NEXT: addl %eax, %eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_bitcast_v8i1_zext: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; 
AVX512DQ-NEXT: kmovb %k0, %eax +; AVX512DQ-NEXT: addl %eax, %eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %mask1 = bitcast <8 x i1> %mask to i8 @@ -2019,6 +3603,24 @@ ; SKX-NEXT: addl %eax, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_bitcast_v16i1_zext: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: addl %eax, %eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_bitcast_v16i1_zext: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: addl %eax, %eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask1 = bitcast <16 x i1> %v1 to i16 %val = zext i16 %mask1 to i32 @@ -2087,6 +3689,22 @@ ; SKX-NEXT: kxorb %k1, %k0, %k0 ; SKX-NEXT: kmovb %k0, %eax ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_v8i1_add: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovw %edi, %k0 +; AVX512BW-NEXT: kmovw %esi, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_v8i1_add: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovb %edi, %k0 +; AVX512DQ-NEXT: kmovb %esi, %k1 +; AVX512DQ-NEXT: kxorb %k1, %k0, %k0 +; AVX512DQ-NEXT: kmovb %k0, %eax +; AVX512DQ-NEXT: retq %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = add <8 x i1> %m0, %m1 @@ -2110,6 +3728,22 @@ ; SKX-NEXT: kxorb %k1, %k0, %k0 ; SKX-NEXT: kmovb %k0, %eax ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_v8i1_sub: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovw %edi, %k0 +; AVX512BW-NEXT: kmovw %esi, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_v8i1_sub: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovb %edi, %k0 +; AVX512DQ-NEXT: kmovb %esi, %k1 +; AVX512DQ-NEXT: kxorb %k1, %k0, %k0 +; AVX512DQ-NEXT: kmovb %k0, %eax +; AVX512DQ-NEXT: retq %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = sub <8 x i1> %m0, %m1 @@ -2133,6 +3767,22 @@ ; SKX-NEXT: kandb %k1, %k0, %k0 ; SKX-NEXT: kmovb %k0, %eax ; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_v8i1_mul: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovw %edi, %k0 +; AVX512BW-NEXT: kmovw %esi, %k1 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_v8i1_mul: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: kmovb %edi, %k0 +; AVX512DQ-NEXT: kmovb %esi, %k1 +; AVX512DQ-NEXT: kandb %k1, %k0, %k0 +; AVX512DQ-NEXT: kmovb %k0, %eax +; AVX512DQ-NEXT: retq %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = mul <8 x i1> %m0, %m1 Index: llvm/trunk/test/CodeGen/X86/vector-compare-results.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-compare-results.ll +++ llvm/trunk/test/CodeGen/X86/vector-compare-results.ll @@ -630,9 +630,9 @@ ; ; AVX512BW-LABEL: test_cmp_v8f64: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: 
vzeroupper ; AVX512BW-NEXT: retq %1 = fcmp ogt <8 x double> %a0, %a1 @@ -693,9 +693,9 @@ ; ; AVX512BW-LABEL: test_cmp_v16f32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x float> %a0, %a1 @@ -807,9 +807,9 @@ ; ; AVX512BW-LABEL: test_cmp_v8i64: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = icmp sgt <8 x i64> %a0, %a1 @@ -873,9 +873,9 @@ ; ; AVX512BW-LABEL: test_cmp_v16i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i32> %a0, %a1 @@ -1149,10 +1149,9 @@ ; ; AVX512BW-LABEL: test_cmp_v32i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: retq %1 = icmp sgt <32 x i16> %a0, %a1 ret <32 x i1> %1 Index: llvm/trunk/test/CodeGen/X86/vector-sext.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-sext.ll +++ llvm/trunk/test/CodeGen/X86/vector-sext.ll @@ -1951,9 +1951,9 @@ ; AVX512BW-LABEL: load_sext_8i1_to_8i16: ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2853,13 +2853,21 @@ ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_16i1_to_16i8: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_16i1_to_16i8: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_16i1_to_16i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_16i1_to_16i8: ; X32-SSE41: # BB#0: # %entry @@ -3391,12 +3399,19 @@ ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_16i1_to_16i16: -; AVX512: # BB#0: # %entry 
-; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_16i1_to_16i16: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_16i1_to_16i16: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_16i1_to_16i16: ; X32-SSE41: # BB#0: # %entry @@ -4235,16 +4250,23 @@ ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_32i1_to_32i8: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: kmovw 2(%rdi), %k2 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_32i1_to_32i8: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: kmovw 2(%rdi), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_32i1_to_32i8: +; AVX512BW: # BB#0: # %entry +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_32i1_to_32i8: ; X32-SSE41: # BB#0: # %entry @@ -4975,10 +4997,9 @@ ; ; AVX512BW-LABEL: sext_32xi1_to_32xi8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: sext_32xi1_to_32xi8: