Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1054,9 +1054,20 @@
   if (CustomLowerNode(N, N->getValueType(0), true))
     return;

-  // Spill the vector to the stack.
+  // Make the vector elements byte-addressable if they aren't already.
   EVT VecVT = Vec.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
+  if (VecVT.getScalarSizeInBits() < 8) {
+    EltVT = MVT::i8;
+    VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+                             VecVT.getVectorNumElements());
+    Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
+    // Extend the element type to match if needed.
+    if (EltVT.bitsGT(Elt.getValueType()))
+      Elt = DAG.getNode(ISD::ANY_EXTEND, dl, EltVT, Elt);
+  }
+
+  // Spill the vector to the stack.
   SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
   auto &MF = DAG.getMachineFunction();
   auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
@@ -1071,19 +1082,29 @@
     Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
                               MachinePointerInfo::getUnknownStack(MF), EltVT);

+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+
   // Load the Lo part from the stack slot.
-  Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo);
+  Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo);

   // Increment the pointer to the other part.
-  unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
+  unsigned IncrementSize = LoVT.getSizeInBits() / 8;
   StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                          DAG.getConstant(IncrementSize, dl,
                                          StackPtr.getValueType()));

   // Load the Hi part from the stack slot.
-  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
+  Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
                    PtrInfo.getWithOffset(IncrementSize),
                    MinAlign(Alignment, IncrementSize));
+
+  // If we adjusted the original type, we need to truncate the results.
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  if (LoVT != Lo.getValueType())
+    Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Lo);
+  if (HiVT != Hi.getValueType())
+    Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
 }

 void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -14539,7 +14539,7 @@
     // Extending v8i1/v16i1 to 512-bit get better performance on KNL
     // than extending to 128/256bit.
     unsigned VecSize = (NumElts <= 4 ? 128 : 512);
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts);
     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               ExtVT.getVectorElementType(), Ext, Idx);
@@ -14725,8 +14725,10 @@
   if (!isa<ConstantSDNode>(Idx)) {
     // Non constant index. Extend source and destination,
     // insert element and then truncate the result.
-    MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
-    MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+    unsigned NumElts = VecVT.getVectorNumElements();
+    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+    MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+    MVT ExtEltVT = ExtVecVT.getVectorElementType();
     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
Index: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1774,3 +1774,592 @@
   %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32>
   ret <8 x i64> %e
 }
+
+define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v32i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $96, %rsp
+; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: andl $31, %esi
+; KNL-NEXT: testb %dil, %dil
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v32i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
+; SKX-NEXT: xorl %eax, %eax
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: setne %al
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: andl $31, %esi
+; SKX-NEXT: movw %ax, (%rsp,%rsi,2)
+; SKX-NEXT: vpsllw $15, (%rsp), %zmm0
+; SKX-NEXT: vpmovw2m %zmm0, %k0
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+  %t1 = icmp ugt <32 x i8> %a, zeroinitializer
+  %t2 = icmp ugt i8 %b, 0
+  %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
+  %t4 = bitcast <32 x i1> %t3 to i32
+  ret i32 %t4
+}
+
+define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v64i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $192, %rsp
+; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT: vmovdqa {{.*#+}} ymm2 =
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 +; KNL-NEXT: andl $63, %esi +; KNL-NEXT: testb %dil, %dil +; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; KNL-NEXT: setne (%rsi,%rax) +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0 +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL-NEXT: vpslld $31, %zmm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm1, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, (%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; KNL-NEXT: movl (%rsp), %eax +; KNL-NEXT: shlq $32, %rax +; KNL-NEXT: orq %rcx, %rax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: test_insertelement_variable_v64i1: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rbp +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: def %esi killed %esi def %rsi +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 +; SKX-NEXT: andl $63, %esi +; SKX-NEXT: testb %dil, %dil +; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} +; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT: movq %rsp, %rax +; SKX-NEXT: setne (%rsi,%rax) +; SKX-NEXT: vpsllw $7, (%rsp), %zmm0 +; SKX-NEXT: vpmovb2m %zmm0, %k0 +; SKX-NEXT: kmovq %k0, %rax +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %t1 = icmp ugt <64 x i8> %a, zeroinitializer + %t2 = icmp ugt i8 %b, 0 + %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index + %t4 = bitcast <64 x i1> %t3 to i64 + ret i64 %t4 +} + +define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) { +; KNL-LABEL: test_insertelement_variable_v96i1: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-128, %rsp +; KNL-NEXT: subq $384, %rsp ## imm = 0x180 +; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, 576(%rbp), 
%xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3 +; 
KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; KNL-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; KNL-NEXT: vpcmpgtb %ymm3, %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: movl 744(%rbp), %eax +; KNL-NEXT: andl $127, %eax +; KNL-NEXT: cmpb $0, 736(%rbp) +; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp) +; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; KNL-NEXT: setne (%rax,%rcx) +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2 +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3 +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4 +; KNL-NEXT: vpmovsxbd %xmm4, %zmm4 +; KNL-NEXT: vpslld $31, %zmm4, %zmm4 +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm2, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm3, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, (%rsp) +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: shlq $32, %rax +; KNL-NEXT: orq %rcx, %rax +; KNL-NEXT: movl (%rsp), %ecx +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx +; KNL-NEXT: shlq $32, %rdx +; KNL-NEXT: orq %rcx, %rdx +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: test_insertelement_variable_v96i1: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rbp +; SKX-NEXT: .cfi_def_cfa_offset 
16 +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-128, %rsp +; SKX-NEXT: subq $256, %rsp ## imm = 0x100 +; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0 +; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SKX-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $3, 120(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $4, 128(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $5, 136(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $6, 144(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $7, 152(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $8, 160(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $9, 168(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $10, 176(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $11, 184(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $12, 192(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $13, 200(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SKX-NEXT: vpinsrb $1, 488(%rbp), 
%xmm1, %xmm1 +; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1 +; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2 +; SKX-NEXT: movl 744(%rbp), %eax +; SKX-NEXT: andl $127, %eax +; SKX-NEXT: cmpb $0, 736(%rbp) +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z} +; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp) +; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT: movq %rsp, %rcx +; SKX-NEXT: setne (%rax,%rcx) +; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0 +; SKX-NEXT: vpmovb2m %zmm0, %k0 +; SKX-NEXT: vpsllw $7, (%rsp), %zmm0 +; SKX-NEXT: vpmovb2m %zmm0, %k1 +; SKX-NEXT: kmovq %k1, %rax +; SKX-NEXT: kmovq %k0, %rdx +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %t1 = icmp ugt <96 x i8> %a, zeroinitializer + %t2 = icmp ugt i8 %b, 0 + %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index + %t4 = bitcast <96 x i1> %t3 to i96 + ret i96 %t4 +} + +define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) { +; KNL-LABEL: test_insertelement_variable_v128i1: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-128, %rsp +; KNL-NEXT: subq $384, %rsp ## imm = 0x180 +; KNL-NEXT: ## kill: def %esi killed %esi def %rsi +; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; KNL-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; KNL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm4, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm4, %ymm1, %ymm1 +; KNL-NEXT: vpxor %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vpcmpgtb %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vpxor %ymm4, %ymm3, %ymm3 
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm3, %ymm3 +; KNL-NEXT: andl $127, %esi +; KNL-NEXT: testb %dil, %dil +; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; KNL-NEXT: setne (%rsi,%rax) +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2 +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3 +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4 +; KNL-NEXT: vpmovsxbd %xmm4, %zmm4 +; KNL-NEXT: vpslld $31, %zmm4, %zmm4 +; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm2, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm3, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, (%rsp) +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: shlq $32, %rax +; KNL-NEXT: orq %rcx, %rax +; KNL-NEXT: movl (%rsp), %ecx +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx +; KNL-NEXT: shlq $32, %rdx +; KNL-NEXT: orq %rcx, %rdx +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: test_insertelement_variable_v128i1: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rbp +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-128, %rsp +; SKX-NEXT: subq $256, %rsp ## imm = 0x100 +; SKX-NEXT: ## kill: def %esi killed %esi def %rsi +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1 +; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2 +; SKX-NEXT: andl $127, %esi +; SKX-NEXT: testb %dil, %dil +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z} +; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp) +; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT: movq %rsp, %rax +; SKX-NEXT: setne (%rsi,%rax) +; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0 +; SKX-NEXT: vpmovb2m %zmm0, %k0 +; SKX-NEXT: vpsllw $7, (%rsp), %zmm0 +; SKX-NEXT: vpmovb2m %zmm0, %k1 +; SKX-NEXT: kmovq %k1, %rax +; SKX-NEXT: kmovq %k0, %rdx +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %t1 = 
icmp ugt <128 x i8> %a, zeroinitializer
+  %t2 = icmp ugt i8 %b, 0
+  %t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
+  %t4 = bitcast <128 x i1> %t3 to i128
+  ret i128 %t4
+}
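
Note (not part of the patch): the LegalizeVectorTypes change above handles a variable-index INSERT_VECTOR_ELT on vXi1 results by any-extending the elements to i8 so they become byte-addressable, spilling the widened vector to a stack temporary, storing the inserted element at the runtime index, reloading the two halves, and truncating them back to the original element type. The standalone C++ sketch below mirrors that sequence with an ordinary byte buffer in place of the DAG stack temporary; the function and variable names are illustrative only and do not appear in LLVM.

// Illustration only: split an insertelement with a runtime index over an
// N x i1 vector by making the elements byte-addressable first.
#include <cstdint>
#include <cstdio>
#include <vector>

// "Any-extend" each i1 element to an i8 so it can be addressed in memory.
static std::vector<uint8_t> anyExtendToBytes(const std::vector<bool> &V) {
  return std::vector<uint8_t>(V.begin(), V.end());
}

// Insert Bit at a runtime Index, then return the Lo/Hi halves truncated back
// to i1. The byte buffer plays the role of the stack temporary in the DAG.
static void splitInsert(const std::vector<bool> &Vec, bool Bit, unsigned Index,
                        std::vector<bool> &Lo, std::vector<bool> &Hi) {
  std::vector<uint8_t> Spill = anyExtendToBytes(Vec); // spill widened vector
  Spill[Index] = Bit;                                 // store element at Index
  unsigned Half = Spill.size() / 2;                   // split point for Lo/Hi
  Lo.assign(Spill.begin(), Spill.begin() + Half);     // load Lo half
  Hi.assign(Spill.begin() + Half, Spill.end());       // load Hi half, truncate to i1
}

int main() {
  std::vector<bool> Vec(64, false);
  std::vector<bool> Lo, Hi;
  splitInsert(Vec, true, 42, Lo, Hi);
  // Bit 42 lands in the upper half: prints "lo[10]=0 hi[10]=1".
  std::printf("lo[10]=%d hi[10]=%d\n", (int)Lo[10], (int)Hi[10]);
  return 0;
}

The final GetSplitDestVTs/TRUNCATE block in the patch corresponds to the narrowing back from the widened i8 elements to the original i1 result types, here modeled by assigning the byte buffer back into vector<bool>.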