Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1054,9 +1054,20 @@
   if (CustomLowerNode(N, N->getValueType(0), true))
     return;
 
-  // Spill the vector to the stack.
+  // Make the vector elements byte-addressable if they aren't already.
   EVT VecVT = Vec.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
+  if (VecVT.getScalarSizeInBits() < 8) {
+    EltVT = MVT::i8;
+    VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+                             VecVT.getVectorNumElements());
+    Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
+    // Extend the element type to match if needed.
+    if (EltVT.bitsGT(Elt.getValueType()))
+      Elt = DAG.getNode(ISD::ANY_EXTEND, dl, EltVT, Elt);
+  }
+
+  // Spill the vector to the stack.
   SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
   auto &MF = DAG.getMachineFunction();
   auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
@@ -1071,19 +1082,29 @@
   Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
                             MachinePointerInfo::getUnknownStack(MF), EltVT);
 
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+
   // Load the Lo part from the stack slot.
-  Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo);
+  Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo);
 
   // Increment the pointer to the other part.
-  unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
+  unsigned IncrementSize = LoVT.getSizeInBits() / 8;
   StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                          DAG.getConstant(IncrementSize, dl,
                                          StackPtr.getValueType()));
 
   // Load the Hi part from the stack slot.
-  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
+  Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
                    PtrInfo.getWithOffset(IncrementSize),
                    MinAlign(Alignment, IncrementSize));
+
+  // If we adjusted the original type, we need to truncate the results.
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  if (LoVT != Lo.getValueType())
+    Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Lo);
+  if (HiVT != Hi.getValueType())
+    Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
 }
 
 void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -14681,8 +14681,10 @@
   if (!isa<ConstantSDNode>(Idx)) {
     // Non constant index. Extend source and destination,
     // insert element and then truncate the result.
-    MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
-    MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+    unsigned NumElts = VecVT.getVectorNumElements();
+    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+    MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+    MVT ExtEltVT = ExtVecVT.getVectorElementType();
     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
                                 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
                                 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
Index: test/CodeGen/X86/avx512-insert-extract.ll
===================================================================
--- test/CodeGen/X86/avx512-insert-extract.ll
+++ test/CodeGen/X86/avx512-insert-extract.ll
@@ -1912,3 +1912,154 @@
   %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> 
   ret <8 x i64> %e
 }
+
+define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v32i1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $96, %rsp
+; KNL-NEXT:    ## kill: %esi %esi %rsi
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    andl $31, %esi
+; KNL-NEXT:    testb %dil, %dil
+; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT:    setne (%rsi,%rax)
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, (%rsp)
+; KNL-NEXT:    movl (%rsp), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_insertelement_variable_v32i1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %esi %esi %rsi
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k1
+; SKX-NEXT:    xorl %eax, %eax
+; SKX-NEXT:    testb %dil, %dil
+; SKX-NEXT:    setne %al
+; SKX-NEXT:    vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT:    andl $31, %esi
+; SKX-NEXT:    movw %ax, (%rsp,%rsi,2)
+; SKX-NEXT:    vpsllw $15, (%rsp), %zmm0
+; SKX-NEXT:    vpmovw2m %zmm0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <32 x i8> %a, zeroinitializer
+  %t2 = icmp ugt i8 %b, 0
+  %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
+  %t4 = bitcast <32 x i1> %t3 to i32
+  ret i32 %t4
+}
+
+define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v64i1:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $192, %rsp
+; KNL-NEXT:    ## kill: %esi %esi %rsi
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpcmpgtb %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    andl $63, %esi
+; KNL-NEXT:    testb %dil, %dil
+; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT:    setne (%rsi,%rax)
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT:    vpslld $31, %zmm2, %zmm2
+; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, (%rsp)
+; KNL-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT:    movl (%rsp), %eax
+; KNL-NEXT:    shlq $32, %rax
+; KNL-NEXT:    orq %rcx, %rax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_insertelement_variable_v64i1:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %esi %esi %rsi
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1
+; SKX-NEXT:    andl $63, %esi
+; SKX-NEXT:    testb %dil, %dil
+; SKX-NEXT:    vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT:    movq %rsp, %rax
+; SKX-NEXT:    setne (%rsi,%rax)
+; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    kmovq %k0, %rax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <64 x i8> %a, zeroinitializer
+  %t2 = icmp ugt i8 %b, 0
+  %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
+  %t4 = bitcast <64 x i1> %t3 to i64
+  ret i64 %t4
+}