Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1453,6 +1453,8 @@ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); @@ -1538,29 +1540,24 @@ addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); - setOperationAction(ISD::ADD, MVT::v2i1, Expand); - setOperationAction(ISD::ADD, MVT::v4i1, Expand); - setOperationAction(ISD::SUB, MVT::v2i1, Expand); - setOperationAction(ISD::SUB, MVT::v4i1, Expand); - setOperationAction(ISD::MUL, MVT::v2i1, Expand); - setOperationAction(ISD::MUL, MVT::v4i1, Expand); - - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); - setOperationAction(ISD::SETCC, MVT::v4i1, Custom); - setOperationAction(ISD::SETCC, MVT::v2i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); + for (auto VT : { MVT::v2i1, MVT::v4i1 }) { + setOperationAction(ISD::ADD, VT, Expand); + setOperationAction(ISD::SUB, VT, Expand); + setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + 
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); - setOperationAction(ISD::SELECT, MVT::v4i1, Custom); - setOperationAction(ISD::SELECT, MVT::v2i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); - setOperationAction(ISD::VSELECT, MVT::v2i1, Expand); - setOperationAction(ISD::VSELECT, MVT::v4i1, Expand); for (auto VT : { MVT::v4i32, MVT::v8i32 }) { setOperationAction(ISD::AND, VT, Legal); @@ -12468,7 +12465,8 @@ } unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) { + if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) || + (VecVT.getVectorNumElements() < 8)) { // Use kshiftlw/rw instruction. 
VecVT = MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -2247,6 +2247,14 @@ def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>; } + +let Predicates = [HasAVX512, HasVLX] in { + def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK1)>; + def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK1)>; +} + let Predicates = [HasBWI] in { def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>; Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -1017,3 +1017,127 @@ %r = insertelement <32 x i8> %x, i8 %y, i32 20 ret <32 x i8> %r } + +define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) { +; KNL-LABEL: test_extractelement_v2i1: +; KNL: ## BB#0: +; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: testb $1, %al +; KNL-NEXT: sete %al +; KNL-NEXT: addb $3, %al +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_v2i1: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0 +; SKX-NEXT: kshiftlw $15, %k0, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb %al, %al +; SKX-NEXT: sete %al +; SKX-NEXT: addb $3, %al +; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: retq + %t1 = icmp ugt <2 x i64> %a, %b + %t2 = extractelement <2 x i1> %t1, i32 0 + %res = select i1 %t2, i8 3, i8 4 + ret i8 %res +} + +define zeroext i8 @test_extractelement_v4i1(<4 x 
i32> %a, <4 x i32> %b) { +; KNL-LABEL: test_extractelement_v4i1: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: testb $1, %al +; KNL-NEXT: sete %al +; KNL-NEXT: addb $3, %al +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_v4i1: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0 +; SKX-NEXT: kshiftlw $15, %k0, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb %al, %al +; SKX-NEXT: sete %al +; SKX-NEXT: addb $3, %al +; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: retq + %t1 = icmp ugt <4 x i32> %a, %b + %t2 = extractelement <4 x i1> %t1, i32 0 + %res = select i1 %t2, i8 3, i8 4 + ret i8 %res +} + +define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) { +; KNL-LABEL: test_extractelement_v32i1: +; KNL: ## BB#0: +; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpextrb $0, %xmm0, %eax +; KNL-NEXT: testb $1, %al +; KNL-NEXT: sete %al +; KNL-NEXT: addb $3, %al +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_v32i1: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 +; SKX-NEXT: kshiftld $31, %k0, %k0 +; SKX-NEXT: kshiftrd $31, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb %al, %al +; SKX-NEXT: sete %al +; SKX-NEXT: addb $3, %al +; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: retq + %t1 = icmp ugt <32 x i8> %a, %b + %t2 = extractelement <32 x i1> %t1, i32 0 + %res = select i1 %t2, i8 3, i8 4 + ret i8 %res +} + +define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) { +; KNL-LABEL: 
test_extractelement_v64i1: +; KNL: ## BB#0: +; KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; KNL-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpextrb $0, %xmm0, %eax +; KNL-NEXT: testb $1, %al +; KNL-NEXT: sete %al +; KNL-NEXT: addb $3, %al +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_v64i1: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 +; SKX-NEXT: kshiftlq $63, %k0, %k0 +; SKX-NEXT: kshiftrq $63, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb %al, %al +; SKX-NEXT: sete %al +; SKX-NEXT: addb $3, %al +; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: retq + %t1 = icmp ugt <64 x i8> %a, %b + %t2 = extractelement <64 x i1> %t1, i32 0 + %res = select i1 %t2, i8 3, i8 4 + ret i8 %res +} Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -1456,11 +1456,12 @@ ; SKX: # BB#0: ; SKX-NEXT: vpslld $31, %xmm2, %xmm2 ; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1 -; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) +; SKX-NEXT: kshiftlw $15, %k1, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 -; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SKX-NEXT: kmovw %k0, %eax ; SKX-NEXT: # implicit-def: %XMM0 ; SKX-NEXT: testb %al, %al ; SKX-NEXT: je .LBB29_2 @@ -1468,16 +1469,18 @@ ; SKX-NEXT: vmovq %xmm1, %rax ; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SKX-NEXT: .LBB29_2: # %else -; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) -; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SKX-NEXT: kshiftlw $14, %k1, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax ; SKX-NEXT: testb %al, 
%al ; SKX-NEXT: je .LBB29_4 ; SKX-NEXT: # BB#3: # %cond.load1 ; SKX-NEXT: vpextrq $1, %xmm1, %rax ; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0 ; SKX-NEXT: .LBB29_4: # %else2 -; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) -; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SKX-NEXT: kshiftlw $13, %k1, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax ; SKX-NEXT: testb %al, %al ; SKX-NEXT: je .LBB29_6 ; SKX-NEXT: # BB#5: # %cond.load4 @@ -1495,10 +1498,11 @@ ; SKX_32-NEXT: .cfi_def_cfa_offset 16 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 ; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1 -; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) +; SKX_32-NEXT: kshiftlw $15, %k1, %k0 +; SKX_32-NEXT: kshiftrw $15, %k0, %k0 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al +; SKX_32-NEXT: kmovw %k0, %eax ; SKX_32-NEXT: # implicit-def: %XMM0 ; SKX_32-NEXT: testb %al, %al ; SKX_32-NEXT: je .LBB29_2 @@ -1506,8 +1510,9 @@ ; SKX_32-NEXT: vmovd %xmm1, %eax ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SKX_32-NEXT: .LBB29_2: # %else -; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) -; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al +; SKX_32-NEXT: kshiftlw $14, %k1, %k0 +; SKX_32-NEXT: kshiftrw $15, %k0, %k0 +; SKX_32-NEXT: kmovw %k0, %eax ; SKX_32-NEXT: testb %al, %al ; SKX_32-NEXT: je .LBB29_4 ; SKX_32-NEXT: # BB#3: # %cond.load1 @@ -1515,8 +1520,9 @@ ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0 ; SKX_32-NEXT: .LBB29_4: # %else2 ; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm2 -; SKX_32-NEXT: kmovb %k1, (%esp) -; SKX_32-NEXT: movb (%esp), %al +; SKX_32-NEXT: kshiftlw $13, %k1, %k0 +; SKX_32-NEXT: kshiftrw $15, %k0, %k0 +; SKX_32-NEXT: kmovw %k0, %eax ; SKX_32-NEXT: testb %al, %al ; SKX_32-NEXT: je .LBB29_6 ; SKX_32-NEXT: # BB#5: # %cond.load4