Index: ../lib/Target/X86/X86ISelLowering.cpp =================================================================== --- ../lib/Target/X86/X86ISelLowering.cpp +++ ../lib/Target/X86/X86ISelLowering.cpp @@ -1381,6 +1381,8 @@ setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); if (Subtarget->hasDQI()) { setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); @@ -1600,6 +1602,8 @@ setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1645,6 +1649,8 @@ setOperationAction(ISD::SELECT, MVT::v2i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -10787,6 +10793,50 @@ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } +// Lower vXi1 vector shuffles. +// There is no dedicated instruction on AVX-512 that shuffles the masks. +// The only way to shuffle bits is to sign-extend the mask vector to SIMD +// vector, shuffle and then truncate it back. 
+static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/o basic ISA!"); + EVT ExtVT; + switch (VT.SimpleTy) { + default: + assert(false && "Expected a vector of i1 elements"); + break; + case MVT::v2i1: + ExtVT = MVT::v2i64; + break; + case MVT::v4i1: + ExtVT = MVT::v4i32; + break; + case MVT::v8i1: + ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + break; + case MVT::v16i1: + ExtVT = MVT::v16i32; + break; + case MVT::v32i1: + ExtVT = MVT::v32i16; + break; + case MVT::v64i1: + ExtVT = MVT::v64i8; + break; + } + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); + if (V2.getOpcode() == ISD::UNDEF) + V2 = DAG.getUNDEF(ExtVT); + else + V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); +} /// \brief Top-level lowering for x86 vector shuffles. 
/// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -10919,6 +10969,8 @@ if (VT.getSizeInBits() == 512) return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (VT.getScalarType() == MVT::i1) + return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } Index: ../test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- ../test/CodeGen/X86/avx512-mask-op.ll +++ ../test/CodeGen/X86/avx512-mask-op.ll @@ -361,39 +361,6 @@ ret <8 x i1>%d } -; KNL-LABEL: test19 -; KNL: movzbl %dil, %eax -; KNL: kmovw %eax, %k0 -; KNL: kshiftlw $13, %k0, %k0 -; KNL: kshiftrw $15, %k0, %k0 -; KNL: kmovw %k0, %eax -; KNL: andl $1, %eax -; KNL: testb %al, %al - -define <8 x i1> @test19(i8 %a) { - %b = bitcast i8 %a to <8 x i1> - %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> - ret <8 x i1> %c -} - -; KNL-LABEL: test20 -; KNL: movzbl %dil, %eax -; KNL: kmovw %eax, %k0 -; KNL: kshiftlw $13, %k0, %k1 -; KNL: kshiftrw $15, %k1, %k1 -; KNL: kshiftlw $12, %k0, %k0 -; KNL: kshiftrw $15, %k0, %k0 -; KNL: kshiftlw $4, %k0, %k0 -; KNL: kshiftlw $1, %k1, %k2 -; KNL: korw %k0, %k2, %k0 -; KNL: kshiftlw $6, %k1, %k1 -; KNL: korw %k1, %k0, %k1 -define <8 x i1> @test20(i8 %a, i16 %y) { - %b = bitcast i8 %a to <8 x i1> - %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 3, i32 undef, i32 2, i32 undef> - ret <8 x i1> %c -} - ; KNL-LABEL: test21 ; KNL: vpand %ymm ; KNL: vextracti128 $1, %ymm2 Index: ../test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- ../test/CodeGen/X86/vector-shuffle-v1.ll +++ ../test/CodeGen/X86/vector-shuffle-v1.ll @@ -0,0 +1,184 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ + +target triple = "x86_64-unknown-unknown" + +define <2 x i1> @test1(<2 x i1> %a) 
{ +; AVX512F-LABEL: test1: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: test1: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0> + ret <2 x i1> %b +} + +define <4 x i1> @test2(<4 x i1> %a) { +; AVX512F-LABEL: test2: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: test2: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x i1> %b +} + +define <8 x i1> @test3(<8 x i64> %a, <8 x i64> %b, <8 x i64>%a1, <8 x i64>%b1) { +; AVX512F-LABEL: test3: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: test3: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %a2 = icmp eq <8 x i64> %a, %a1 + %b2 = icmp eq <8 x i64> 
 %b, %b1 + %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0> + ret <8 x i1> %c +} + +define <16 x i1> @test4(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) { +; AVX512F-LABEL: test4: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 +; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm1 {%k2} {z} +; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z} +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: test4: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1 +; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; VL_BW_DQ-NEXT: vpmovd2m %zmm1, %k0 +; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %a2 = icmp eq <16 x i32> %a, %a1 + %b2 = icmp eq <16 x i32> %b, %b1 + %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0> + ret <16 x i1> %c +} + +define <32 x i1> @test5(<32 x i1> %a) { +; AVX512F-LABEL: test5: +; AVX512F: # BB#0: +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0] +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq 
+; +; VL_BW_DQ-LABEL: test5: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0 +; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0 +; VL_BW_DQ-NEXT: retq + %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0> + ret <32 x i1> %b +} + +define <8 x i1> @test6(i8 %a) { +; AVX512F-LABEL: test6: +; AVX512F: # BB#0: +; AVX512F-NEXT: movzbl %dil, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2 +; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: test6: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x i1> %c +} + +define <8 x i1> @test7(i8 %a, i16 %y) { +; AVX512F-LABEL: test7: +; AVX512F: # BB#0: +; AVX512F-NEXT: movzbl %dil, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,2,u,u,3,u,2,u> +; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; 
AVX512F-NEXT: retq +; +; VL_BW_DQ-LABEL: test7: +; VL_BW_DQ: # BB#0: +; VL_BW_DQ-NEXT: kmovb %edi, %k0 +; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,2,u,u,3,u,2,u> +; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-NEXT: retq + %b = bitcast i8 %a to <8 x i1> + %c = shufflevector < 8 x i1>%b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 3, i32 undef, i32 2, i32 undef> + ret <8 x i1> %c +}