Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -13554,21 +13554,39 @@
   MVT InVT = In.getSimpleValueType();
   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
+  assert((InVT.is256BitVector() || InVT.is128BitVector() ||
+          InVT.is512BitVector()) && "Unexpected vector type.");
 
   // Shift LSB to MSB and use VPMOVB2M - SKX.
   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
-  if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
-        Subtarget->hasBWI()) ||     // legal, will go to VPMOVB2M, VPMOVW2M
-      ((InVT.is256BitVector() || InVT.is128BitVector()) &&
-        InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() &&
-        Subtarget->hasVLX())) {     // legal, will go to VPMOVB2M, VPMOVW2M
-    // Shift packed bytes not supported natively, bitcast to dword
+
+  if (InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI()) {
+    // Legal; will lower to VPMOVB2M / VPMOVW2M.
+    // Shifting packed bytes is not supported natively, so bitcast to words.
     MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
     SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
                                     DAG.getBitcast(ExtVT, In),
                                     DAG.getConstant(ShiftInx, DL, ExtVT));
     ShiftNode = DAG.getBitcast(InVT, ShiftNode);
-    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+
+    if ((InVT.is256BitVector() || InVT.is128BitVector()) &&
+        !Subtarget->hasVLX()) {
+      // No VLX: widen to 512 bits and use the 512-bit instruction.
+      MVT EltVT = InVT.getVectorElementType();
+      MVT NewInVT = MVT::getVectorVT(EltVT, 512 / EltVT.getSizeInBits());
+      MVT NewVT = MVT::getVectorVT(MVT::i1, NewInVT.getVectorNumElements());
+      SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewInVT,
+                                   DAG.getUNDEF(NewInVT),
+                                   ShiftNode,
+                                   DAG.getIntPtrConstant(0, DL));
+
+      SDValue CVT2MaskNode = DAG.getNode(X86ISD::CVT2MASK, DL, NewVT, Vec512);
+
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CVT2MaskNode,
+                         DAG.getIntPtrConstant(0, DL));
+    } else {
+      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+    }
   }
 
   if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
       Subtarget->hasDQI()) ||  // legal, will go to VPMOVD2M, VPMOVQ2M
@@ -13581,19 +13599,38 @@
     return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
   }
+
   // Shift LSB to MSB, extend if necessary and use TESTM.
   unsigned NumElts = InVT.getVectorNumElements();
-  if (InVT.getSizeInBits() < 512 &&
-      (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 ||
-       !Subtarget->hasVLX())) {
-    assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type.");
-
-    // TESTD/Q should be used (if BW supported we use CVT2MASK above),
-    // so vector should be extended to packed dword/qword.
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
-    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
-    InVT = ExtVT;
-    ShiftInx = InVT.getScalarSizeInBits() - 1;
+  if (InVT.getSizeInBits() < 512) {
+    if ((InVT.getScalarType() == MVT::i32 || InVT.getScalarType() == MVT::i64) &&
+        !Subtarget->hasVLX()) {
+      // No VLX: widen to 512 bits and use the 512-bit instruction.
+      MVT EltVT = InVT.getVectorElementType();
+      MVT NewInVT = MVT::getVectorVT(EltVT, 512 / EltVT.getSizeInBits());
+      MVT NewVT = MVT::getVectorVT(MVT::i1, NewInVT.getVectorNumElements());
+      SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewInVT,
+                                   DAG.getUNDEF(NewInVT),
+                                   In,
+                                   DAG.getIntPtrConstant(0, DL));
+
+      SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, NewInVT, Vec512,
+                                      DAG.getConstant(ShiftInx, DL, NewInVT));
+      SDValue TestNode = DAG.getNode(X86ISD::TESTM, DL, NewVT, ShiftNode,
+                                     ShiftNode);
+
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, TestNode,
+                         DAG.getIntPtrConstant(0, DL));
+    } else if (InVT.getScalarType() == MVT::i8 ||
+               InVT.getScalarType() == MVT::i16) {
+      assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type.");
+      // TESTD/Q should be used (if BW is supported we use CVT2MASK above),
+      // so the vector should be extended to packed dword/qword.
+      MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+      InVT = ExtVT;
+      ShiftInx = InVT.getScalarSizeInBits() - 1;
+    }
   }
   SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -791,6 +791,14 @@
           (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
             (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
             sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
+            (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+            sub_ymm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
+            (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+            sub_ymm)>;
 
 def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)),
           (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
@@ -2474,15 +2482,18 @@
 }
 def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))),
           (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>;
-
-def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
-          (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>;
+def : Pat<(v8i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))),
+          (v8i1 (COPY_TO_REGCLASS VK32:$src, VK8))>;
+def : Pat<(v8i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))),
+          (v8i1 (COPY_TO_REGCLASS VK64:$src, VK8))>;
 
 def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
           (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
 
 def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))),
           (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>;
+def : Pat<(v16i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))),
+          (v16i1 (COPY_TO_REGCLASS VK64:$src, VK16))>;
 
 def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
           (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
@@ -2507,6 +2518,9 @@
 def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
           (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>;
 
+def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
+          (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>;
+
 def :
Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>; def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))), Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -679,9 +679,8 @@ ; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 -; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0 -; KNL_64-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL_64-NEXT: vpslld $31, %zmm1, %zmm0 +; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} ; KNL_64-NEXT: retq ; @@ -691,9 +690,8 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 -; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0 -; KNL_32-NEXT: vpsllvq .LCPI14_0, %zmm0, %zmm0 -; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm0 +; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} ; KNL_32-NEXT: retl ; @@ -833,9 +831,8 @@ ; KNL_64: # BB#0: ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 -; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_64-NEXT: retq ; @@ -844,9 +841,8 @@ ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_32-NEXT: vpsllvq .LCPI17_0, %zmm2, %zmm2 -; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vpslld $31, %zmm2, %zmm2 +; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; @@ -922,9 +918,8 @@ ; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 -; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} ; KNL_64-NEXT: retq ; @@ -936,9 +931,8 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_32-NEXT: vpsllvq .LCPI19_0, %zmm2, %zmm2 -; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vpslld $31, %zmm2, %zmm2 +; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; @@ -1024,9 +1018,8 @@ ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 -; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 -; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_64-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} ; KNL_64-NEXT: vmovaps %zmm2, 
%zmm0 ; KNL_64-NEXT: retq @@ -1040,9 +1033,8 @@ ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 -; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_32-NEXT: vpsllvq .LCPI21_0, %zmm1, %zmm1 -; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 ; KNL_32-NEXT: retl Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512NoVL --check-prefix=AVX512F +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512NoVL --check-prefix=AVX512BW ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX ; To test for the case where masked load/store is not legal, we should add a run with a target @@ -194,13 +195,13 @@ ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test5: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1} -; AVX512F-NEXT: vmovaps %zmm1, %zmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test5: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512NoVL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; AVX512NoVL-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; AVX512NoVL-NEXT: vmovaps %zmm1, %zmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test5: ; SKX: ## BB#0: @@ -223,13 +224,13 @@ ; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test6: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 -; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test6: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 +; AVX512NoVL-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test6: ; SKX: ## BB#0: @@ -252,13 +253,13 @@ ; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test7: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test7: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX512NoVL-NEXT: vblendvps %xmm0, %xmm2, %xmm1, 
%xmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test7: ; SKX: ## BB#0: @@ -289,13 +290,13 @@ ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test8: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 -; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test8: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 +; AVX512NoVL-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test8: ; SKX: ## BB#0: @@ -324,12 +325,12 @@ ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test9: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test9: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test9: ; SKX: ## BB#0: @@ -364,14 +365,14 @@ ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test10: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test10: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512NoVL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 +; AVX512NoVL-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test10: ; SKX: ## BB#0: @@ -405,13 +406,13 @@ ; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test10b: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test10b: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512NoVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512NoVL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test10b: ; SKX: ## BB#0: @@ -444,15 +445,15 @@ ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test11a: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: vmovups (%rdi), %zmm1 {%k1} -; AVX512F-NEXT: vmovaps %zmm1, %zmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test11a: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512NoVL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512NoVL-NEXT: kshiftlw $8, %k0, %k0 +; AVX512NoVL-NEXT: kshiftrw $8, %k0, %k1 +; AVX512NoVL-NEXT: vmovups (%rdi), %zmm1 {%k1} +; AVX512NoVL-NEXT: vmovaps %zmm1, %zmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test11a: ; SKX: ## BB#0: @@ -500,6 +501,16 @@ ; AVX512F-NEXT: vmovaps %zmm1, %zmm0 ; 
AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test11b: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; ; SKX-LABEL: test11b: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 @@ -542,6 +553,15 @@ ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test11c: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; ; SKX-LABEL: test11c: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 @@ -583,6 +603,15 @@ ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test11d: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; ; SKX-LABEL: test11d: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 @@ -613,14 +642,14 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test12: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test12: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512NoVL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; AVX512NoVL-NEXT: kshiftlw $8, %k0, %k0 +; AVX512NoVL-NEXT: kshiftrw $8, %k0, %k1 +; AVX512NoVL-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test12: ; SKX: ## BB#0: @@ -692,15 +721,15 @@ ; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test14: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test14: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512NoVL-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512NoVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512NoVL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test14: ; SKX: ## BB#0: @@ -739,16 +768,16 @@ ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test15: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test15: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: 
vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512NoVL-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512NoVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512NoVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512NoVL-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test15: ; SKX: ## BB#0: @@ -785,16 +814,16 @@ ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test16: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test16: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512NoVL-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512NoVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512NoVL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX512NoVL-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test16: ; SKX: ## BB#0: @@ -838,18 +867,18 @@ ; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test17: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test17: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512NoVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512NoVL-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512NoVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512NoVL-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 +; AVX512NoVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512NoVL-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512NoVL-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test17: ; SKX: ## BB#0: @@ -888,15 +917,15 @@ ; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test18: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test18: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512NoVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512NoVL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512NoVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512NoVL-NEXT: vmaskmovps 
(%rdi), %xmm0, %xmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test18: ; SKX: ## BB#0: @@ -919,11 +948,11 @@ ; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test19: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test19: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test19: ; SKX: ## BB#0: @@ -943,12 +972,12 @@ ; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test20: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295] -; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 -; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test20: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295] +; AVX512NoVL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 +; AVX512NoVL-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test20: ; SKX: ## BB#0: @@ -975,11 +1004,11 @@ ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test21: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test21: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512NoVL-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test21: ; SKX: ## BB#0: @@ -1006,12 +1035,12 @@ ; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test22: -; AVX512F: ## BB#0: -; AVX512F-NEXT: movl $-1, %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX512F-NEXT: retq +; AVX512NoVL-LABEL: test22: +; AVX512NoVL: ## BB#0: +; AVX512NoVL-NEXT: movl $-1, %eax +; AVX512NoVL-NEXT: vmovd %eax, %xmm0 +; AVX512NoVL-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX512NoVL-NEXT: retq ; ; SKX-LABEL: test22: ; SKX: ## BB#0: @@ -1183,6 +1212,15 @@ ; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} ; AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test24: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: retq +; ; SKX-LABEL: test24: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -1274,6 +1312,15 @@ ; AVX512F-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} ; AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test_store_16i64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} +; AVX512BW-NEXT: retq +; ; SKX-LABEL: test_store_16i64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -1366,6 +1413,15 @@ ; AVX512F-NEXT: vmovupd %zmm2, 64(%rdi) {%k1} ; AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test_store_16f64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovupd %zmm1, (%rdi) {%k1} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovupd %zmm2, 64(%rdi) {%k1} +; AVX512BW-NEXT: retq +; ; SKX-LABEL: 
test_store_16f64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -1468,6 +1524,17 @@ ; AVX512F-NEXT: vmovaps %zmm2, %zmm1 ; AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test_load_16i64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovaps %zmm2, %zmm1 +; AVX512BW-NEXT: retq +; ; SKX-LABEL: test_load_16i64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -1572,6 +1639,17 @@ ; AVX512F-NEXT: vmovaps %zmm2, %zmm1 ; AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test_load_16f64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovaps %zmm2, %zmm1 +; AVX512BW-NEXT: retq +; ; SKX-LABEL: test_load_16f64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -1797,6 +1875,23 @@ ; AVX512F-NEXT: vmovaps %zmm4, %zmm3 ; AVX512F-NEXT: retq ; +; AVX512BW-LABEL: test_load_32f64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: vmovupd 128(%rdi), %zmm3 {%k2} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; AVX512BW-NEXT: kshiftrw $8, %k2, %k1 +; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovaps %zmm2, %zmm1 +; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 +; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 +; AVX512BW-NEXT: retq +; ; SKX-LABEL: test_load_32f64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -1816,4 +1911,253 @@ %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) ret <32 x double> %res } + +define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64> %src0) { +; AVX1-LABEL: test_load_32i64: +; AVX1: ## BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: Ltmp3: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: Ltmp4: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: Ltmp5: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: vmovapd 16(%rbp), %ymm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm9, %xmm9 +; AVX1-NEXT: vpsrad $31, %xmm9, %xmm9 +; AVX1-NEXT: vpmovsxdq %xmm9, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm9, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-NEXT: vmaskmovpd 32(%rsi), %ymm9, %ymm10 +; AVX1-NEXT: vblendvpd %ymm9, %ymm10, %ymm2, %ymm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, 
%xmm2, %ymm10, %ymm2 +; AVX1-NEXT: vmaskmovpd 64(%rsi), %ymm2, %ymm10 +; AVX1-NEXT: vblendvpd %ymm2, %ymm10, %ymm3, %ymm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2 +; AVX1-NEXT: vmaskmovpd 96(%rsi), %ymm2, %ymm10 +; AVX1-NEXT: vblendvpd %ymm2, %ymm10, %ymm4, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX1-NEXT: vmaskmovpd 160(%rsi), %ymm3, %ymm10 +; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX1-NEXT: vmaskmovpd 192(%rsi), %ymm3, %ymm10 +; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm7, %ymm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX1-NEXT: vmaskmovpd 224(%rsi), %ymm3, %ymm10 +; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm8, %ymm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 +; AVX1-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm8 +; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm1, %ymm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmaskmovpd 128(%rsi), %ymm1, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, 128(%rdi) +; AVX1-NEXT: vmovapd %ymm0, (%rdi) +; AVX1-NEXT: vmovapd %ymm3, 224(%rdi) +; AVX1-NEXT: vmovapd %ymm7, 192(%rdi) +; AVX1-NEXT: vmovapd %ymm6, 160(%rdi) +; AVX1-NEXT: vmovapd %ymm4, 96(%rdi) +; AVX1-NEXT: vmovapd 
%ymm11, 64(%rdi) +; AVX1-NEXT: vmovapd %ymm9, 32(%rdi) +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_32i64: +; AVX2: ## BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: Ltmp3: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: Ltmp4: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: Ltmp5: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: vmovapd 16(%rbp), %ymm8 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm9, %xmm9 +; AVX2-NEXT: vpsrad $31, %xmm9, %xmm9 +; AVX2-NEXT: vpmovsxdq %xmm9, %ymm9 +; AVX2-NEXT: vpmaskmovq 32(%rsi), %ymm9, %ymm10 +; AVX2-NEXT: vblendvpd %ymm9, %ymm10, %ymm2, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vpmaskmovq 64(%rsi), %ymm2, %ymm10 +; AVX2-NEXT: vblendvpd %ymm2, %ymm10, %ymm3, %ymm11 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vpmaskmovq 96(%rsi), %ymm2, %ymm10 +; AVX2-NEXT: vblendvpd %ymm2, %ymm10, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmaskmovq 160(%rsi), %ymm3, %ymm10 +; AVX2-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm6, %xmm6 +; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6 +; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6 +; AVX2-NEXT: vpmaskmovq 192(%rsi), %ymm6, %ymm10 +; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm7, %ymm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm7, %xmm7 +; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7 +; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7 +; AVX2-NEXT: vpmaskmovq 224(%rsi), %ymm7, %ymm10 +; AVX2-NEXT: vblendvpd %ymm7, %ymm10, %ymm8, %ymm7 +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpmaskmovq (%rsi), %ymm0, %ymm8 +; AVX2-NEXT: vblendvpd %ymm0, %ymm8, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, 
%xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq 128(%rsi), %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, 128(%rdi) +; AVX2-NEXT: vmovapd %ymm0, (%rdi) +; AVX2-NEXT: vmovapd %ymm7, 224(%rdi) +; AVX2-NEXT: vmovapd %ymm6, 192(%rdi) +; AVX2-NEXT: vmovapd %ymm3, 160(%rdi) +; AVX2-NEXT: vmovapd %ymm4, 96(%rdi) +; AVX2-NEXT: vmovapd %ymm11, 64(%rdi) +; AVX2-NEXT: vmovapd %ymm9, 32(%rdi) +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_load_32i64: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5 +; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5 +; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k1} +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k2} +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1} +; AVX512F-NEXT: kshiftrw $8, %k2, %k1 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: vmovaps %zmm2, %zmm1 +; AVX512F-NEXT: vmovaps %zmm3, %zmm2 +; AVX512F-NEXT: vmovaps %zmm4, %zmm3 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: test_load_32i64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} +; AVX512BW-NEXT: kshiftrw $8, %k2, %k1 +; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovaps %zmm2, %zmm1 +; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 +; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 +; AVX512BW-NEXT: retq +; +; SKX-LABEL: test_load_32i64: +; SKX: ## BB#0: +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 +; SKX-NEXT: vpmovb2m %ymm0, %k1 +; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; SKX-NEXT: kshiftrd $16, %k1, %k2 +; SKX-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2} +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} +; SKX-NEXT: kshiftrw $8, %k2, %k1 +; SKX-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vmovaps %zmm2, %zmm1 +; SKX-NEXT: vmovaps %zmm3, %zmm2 +; SKX-NEXT: vmovaps %zmm4, %zmm3 +; SKX-NEXT: retq + %res = call <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) + ret <32 x i64> %res +} + +declare <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0) declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
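
Note for reviewers: below is a minimal standalone IR reproducer for the widened no-VLX TESTM path, illustrative only and not part of this patch; the function name, the use of llvm.masked.load.v8f32, and the RUN line are assumptions made for the example. Compiled with -mattr=avx512f and without avx512vl, the <8 x i1> mask produced by the truncate should now be materialized with a 512-bit VPSLLD + VPTESTMD sequence, matching the updated KNL checks in masked_gather_scatter.ll, rather than being sign-extended to packed qwords first.

; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s
; Hypothetical reproducer, not part of this patch.
define <8 x float> @trunc_v8i1_mask_load(<8 x i32> %m, <8 x float>* %p) {
  ; Truncate each dword to its low bit, producing a <8 x i1> mask.
  %mask = trunc <8 x i32> %m to <8 x i1>
  ; Using the mask as a masked-load predicate forces it into a k-register.
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %p, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %res
}
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)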