Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -13555,47 +13555,29 @@
   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
 
-  // Shift LSB to MSB and use VPMOVB2M - SKX.
+  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
-  if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
-       Subtarget->hasBWI()) ||     // legal, will go to VPMOVB2M, VPMOVW2M
-      ((InVT.is256BitVector() || InVT.is128BitVector()) &&
-       InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() &&
-       Subtarget->hasVLX())) {     // legal, will go to VPMOVB2M, VPMOVW2M
-    // Shift packed bytes not supported natively, bitcast to dword
-    MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
-    SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
-                                    DAG.getBitcast(ExtVT, In),
-                                    DAG.getConstant(ShiftInx, DL, ExtVT));
-    ShiftNode = DAG.getBitcast(InVT, ShiftNode);
-    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
-  }
-  if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
-       Subtarget->hasDQI()) ||     // legal, will go to VPMOVD2M, VPMOVQ2M
-      ((InVT.is256BitVector() || InVT.is128BitVector()) &&
-       InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() &&
-       Subtarget->hasVLX())) {     // legal, will go to VPMOVD2M, VPMOVQ2M
-
-    SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
-                                    DAG.getConstant(ShiftInx, DL, InVT));
-    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
-  }
-
-  // Shift LSB to MSB, extend if necessary and use TESTM.
-  unsigned NumElts = InVT.getVectorNumElements();
-  if (InVT.getSizeInBits() < 512 &&
-      (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 ||
-       !Subtarget->hasVLX())) {
-    assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type.");
-
-    // TESTD/Q should be used (if BW supported we use CVT2MASK above),
-    // so vector should be extended to packed dword/qword.
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
-    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
-    InVT = ExtVT;
-    ShiftInx = InVT.getScalarSizeInBits() - 1;
+  if (InVT.getScalarSizeInBits() <= 16) {
+    if (Subtarget->hasBWI()) {
+      // legal, will go to VPMOVB2M, VPMOVW2M
+      // Shift packed bytes not supported natively, bitcast to word
+      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+      SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
+                                      DAG.getBitcast(ExtVT, In),
+                                      DAG.getConstant(ShiftInx, DL, ExtVT));
+      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
+      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+    } else {
+      // Use TESTD/Q, extend the vector to packed dword/qword.
+      assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
+             "Unexpected vector type.");
+      unsigned NumElts = InVT.getVectorNumElements();
+      MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+      InVT = ExtVT;
+      ShiftInx = InVT.getScalarSizeInBits() - 1;
+    }
   }
-
   SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
                                   DAG.getConstant(ShiftInx, DL, InVT));
   return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -30,6 +30,10 @@
   // Corresponding write-mask register class. 
RegisterClass KRCWM = !cast("VK" # NumElts # "WM"); + // The mask VT. + ValueType KVT = !cast(!if (!eq (NumElts, 1), "i1", + "v" # NumElts # "i1")); + // The GPR register class that can hold the write mask. Use GR8 for fewer // than 8 elements. Use shift-right and equal to work around the lack of // !lt in tablegen. @@ -775,34 +779,30 @@ // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. -def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), - (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - sub_ymm)>; -def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)), - (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), - (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - sub_ymm)>; -def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - sub_ymm)>; -def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), - (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - sub_ymm)>; +def : Pat<(v8i64 (insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0))), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; +def : Pat<(v8f64 (insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0))), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; +def : Pat<(v16i32 (insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0))), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; +def : Pat<(v16f32 (insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0))), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; +def : Pat<(v32i16 (insert_subvector undef, (v8i16 VR128X:$src), (iPTR 0))), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; +def : Pat<(v64i8 (insert_subvector undef, (v16i8 VR128X:$src), (iPTR 0))), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)), +def : Pat<(v8i64 (insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0))), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)), +def : Pat<(v8f64 (insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0))), (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), +def : Pat<(v16i32 (insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0))), (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), +def : Pat<(v16f32 (insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0))), (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)), +def : Pat<(v32i16 (insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0))), (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)), +def : Pat<(v64i8 (insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0))), (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; // vextractps - extract 32 bits from XMM @@ -2474,15 +2474,18 @@ } def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))), (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>; - -def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 
0))), - (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>; +def : Pat<(v8i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK32:$src, VK8))>; +def : Pat<(v8i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK64:$src, VK8))>; def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; +def : Pat<(v16i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK64:$src, VK16))>; def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; @@ -2507,6 +2510,9 @@ def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; +def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>; + def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>; def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))), @@ -3831,8 +3837,22 @@ (_.ScalarLdFrag addr:$src2))))>, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; } + +// Use 512bit version to implement 128/256 bit in case NoVLX. +multiclass avx512_vptest_lowering { + def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))), + (_.KVT (COPY_TO_REGCLASS + (!cast(NAME # Suffix # "Zrr") + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src1, _.SubRegIdx), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src2, _.SubRegIdx)), + _.KRC))>; +} + multiclass avx512_vptest_dq_sizes opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo _> { + AVX512VLVectorVTInfo _, string Suffix> { let Predicates = [HasAVX512] in defm Z : avx512_vptest, avx512_vptest_mb, EVEX_V512; @@ -3843,13 +3863,17 @@ defm Z128 : avx512_vptest, avx512_vptest_mb, EVEX_V128; } + let Predicates = [HasAVX512, NoVLX] in { + defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>; + defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>; + } } multiclass avx512_vptest_dq opc, string OpcodeStr, SDNode OpNode> { defm D : avx512_vptest_dq_sizes; + avx512vl_i32_info, "D">; defm Q : avx512_vptest_dq_sizes, VEX_W; + avx512vl_i64_info, "Q">, VEX_W; } multiclass avx512_vptest_wb opc, string OpcodeStr, @@ -3871,6 +3895,14 @@ defm BZ128: avx512_vptest, EVEX_V128; } + + let Predicates = [HasAVX512, NoVLX] in { + defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">; + defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">; + defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">; + defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">; + } + } multiclass avx512_vptest_all_forms opc_wb, bits<8> opc_dq, string OpcodeStr, @@ -6566,22 +6598,38 @@ defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; multiclass convert_vector_to_mask_common opc, X86VectorVTInfo _, string OpcodeStr > { -def rr : AVX512XS8I, EVEX; + def rr : AVX512XS8I, EVEX; +} + +// Use 512bit version to implement 128/256 bit in case NoVLX. 
+multiclass convert_vector_to_mask_lowering { + + def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))), + (_.KVT (COPY_TO_REGCLASS + (!cast(NAME#"Zrr") + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src, _.SubRegIdx)), + _.KRC))>; } multiclass avx512_convert_vector_to_mask opc, string OpcodeStr, - AVX512VLVectorVTInfo VTInfo, Predicate prd> { -let Predicates = [prd] in - defm Z : convert_vector_to_mask_common , - EVEX_V512; + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : convert_vector_to_mask_common , + EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : convert_vector_to_mask_common, - EVEX_V256; + EVEX_V256; defm Z128 : convert_vector_to_mask_common, - EVEX_V128; + EVEX_V128; + } + let Predicates = [prd, NoVLX] in { + defm Z256_Alt : convert_vector_to_mask_lowering; + defm Z128_Alt : convert_vector_to_mask_lowering; } } Index: test/CodeGen/X86/avx512-bugfix-26264.ll =================================================================== --- test/CodeGen/X86/avx512-bugfix-26264.ll +++ test/CodeGen/X86/avx512-bugfix-26264.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw < %s | FileCheck %s --check-prefix=AVX512BW + +define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) { +; AVX512BW-LABEL: test_load_32f64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: vmovupd 128(%rdi), %zmm3 {%k2} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; AVX512BW-NEXT: kshiftrw $8, %k2, %k1 +; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovaps %zmm2, %zmm1 +; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 +; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 +; AVX512BW-NEXT: retq + %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) + ret <32 x double> %res +} + +define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64> %src0) { +; AVX512BW-LABEL: test_load_32i64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2} +; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} +; AVX512BW-NEXT: kshiftrw $8, %k2, %k1 +; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovaps %zmm2, %zmm1 +; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 +; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 +; AVX512BW-NEXT: retq + %res = call <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) + ret <32 x i64> %res +} + +declare <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0) +declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) Index: test/CodeGen/X86/avx512-calling-conv.ll =================================================================== --- test/CodeGen/X86/avx512-calling-conv.ll +++ test/CodeGen/X86/avx512-calling-conv.ll @@ -102,11 +102,10 @@ ; ; SKX-LABEL: test4: ; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, 
%xmm1, %xmm1 ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k0 -; SKX-NEXT: vpslld $31, %xmm1, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 -; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k0 {%k1} ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: retq ; Index: test/CodeGen/X86/avx512-ext.ll =================================================================== --- test/CodeGen/X86/avx512-ext.ll +++ test/CodeGen/X86/avx512-ext.ll @@ -314,7 +314,7 @@ ; SKX-LABEL: zext_4x8mem_to_4x32: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovzxbd (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 @@ -335,7 +335,7 @@ ; SKX-LABEL: sext_4x8mem_to_4x32: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 @@ -503,7 +503,7 @@ ; SKX-LABEL: zext_2x8mem_to_2x64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovzxbq (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <2 x i8>,<2 x i8> *%i,align 1 @@ -524,7 +524,7 @@ ; SKX-LABEL: sext_2x8mem_to_2x64mask: ; SKX: ## BB#0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <2 x i8>,<2 x i8> *%i,align 1 @@ -555,7 +555,7 @@ ; SKX-LABEL: zext_4x8mem_to_4x64: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovzxbq (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 @@ -577,7 +577,7 @@ ; SKX-LABEL: sext_4x8mem_to_4x64mask: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i8>,<4 x i8> *%i,align 1 @@ -660,7 +660,7 @@ ; SKX-LABEL: zext_4x16mem_to_4x32: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovzxwd (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 @@ -681,7 +681,7 @@ ; SKX-LABEL: sext_4x16mem_to_4x32mask: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 @@ -886,7 +886,7 @@ ; SKX-LABEL: zext_2x16mem_to_2x64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovzxwq (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <2 x i16>,<2 x i16> *%i,align 1 @@ -908,7 +908,7 @@ ; SKX-LABEL: sext_2x16mem_to_2x64mask: ; SKX: ## BB#0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <2 x i16>,<2 x i16> *%i,align 1 @@ -940,7 +940,7 @@ ; SKX-LABEL: zext_4x16mem_to_4x64: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: 
vpmovzxwq (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 @@ -962,7 +962,7 @@ ; SKX-LABEL: sext_4x16mem_to_4x64mask: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i16>,<4 x i16> *%i,align 1 @@ -1075,7 +1075,7 @@ ; SKX-LABEL: zext_2x32mem_to_2x64: ; SKX: ## BB#0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovzxdq (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <2 x i32>,<2 x i32> *%i,align 1 @@ -1097,7 +1097,7 @@ ; SKX-LABEL: sext_2x32mem_to_2x64mask: ; SKX: ## BB#0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %a = load <2 x i32>,<2 x i32> *%i,align 1 @@ -1129,7 +1129,7 @@ ; SKX-LABEL: zext_4x32mem_to_4x64: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovzxdq (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i32>,<4 x i32> *%i,align 1 @@ -1151,7 +1151,7 @@ ; SKX-LABEL: sext_4x32mem_to_4x64mask: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %a = load <4 x i32>,<4 x i32> *%i,align 1 @@ -1192,7 +1192,7 @@ ; SKX-LABEL: zext_4x32_to_4x64mask: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX-NEXT: vpmovd2m %xmm1, %k1 +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq %x = zext <4 x i32> %a to <4 x i64> @@ -1347,19 +1347,12 @@ } define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { -; KNL-LABEL: trunc_16i32_to_16i1: -; KNL: ## BB#0: -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_16i32_to_16i1: -; SKX: ## BB#0: -; SKX-NEXT: vpslld $31, %zmm0, %zmm0 -; SKX-NEXT: vpmovd2m %zmm0, %k0 -; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: retq +; ALL-LABEL: trunc_16i32_to_16i1: +; ALL: ## BB#0: +; ALL-NEXT: vpslld $31, %zmm0, %zmm0 +; ALL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; ALL-NEXT: kmovw %k0, %eax +; ALL-NEXT: retq %mask_b = trunc <16 x i32>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask @@ -1376,10 +1369,9 @@ ; SKX-LABEL: trunc_4i32_to_4i1: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k0 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 ; SKX-NEXT: vpslld $31, %xmm1, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k1 -; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: retq %mask_a = trunc <4 x i32>%a to <4 x i1> Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -354,7 +354,7 @@ ; SKX-NEXT: LBB17_1: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ; SKX-NEXT: LBB17_3: -; SKX-NEXT: vpmovd2m %xmm0, %k0 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: retq %mask = icmp sgt i32 %a1, %b1 @@ -1415,7 +1415,7 @@ ; SKX-LABEL: test22: ; SKX: ## BB#0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vpmovd2m %xmm0, %k0 +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 ; 
SKX-NEXT: kmovb %k0, (%rdi) ; SKX-NEXT: retq store <4 x i1> %a, <4 x i1>* %addr @@ -1436,7 +1436,7 @@ ; SKX-LABEL: test23: ; SKX: ## BB#0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vpmovq2m %xmm0, %k0 +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 ; SKX-NEXT: kmovb %k0, (%rdi) ; SKX-NEXT: retq store <2 x i1> %a, <2 x i1>* %addr Index: test/CodeGen/X86/avx512-skx-insert-subvec.ll =================================================================== --- test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: test: ; CHECK: # BB#0: ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 -; CHECK-NEXT: vpmovq2m %xmm0, %k0 +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 ; CHECK-NEXT: kshiftlb $2, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: retq @@ -17,7 +17,7 @@ ; CHECK-LABEL: test1: ; CHECK: # BB#0: ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 -; CHECK-NEXT: vpmovq2m %xmm0, %k0 +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: retq @@ -29,12 +29,12 @@ ; CHECK-LABEL: test2: ; CHECK: # BB#0: ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 -; CHECK-NEXT: vpmovq2m %xmm0, %k0 +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 ; CHECK-NEXT: vpmovm2q %k0, %zmm0 ; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] ; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0 -; CHECK-NEXT: vpmovq2m %zmm0, %k0 +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: retq %res = shufflevector <2 x i1> %a, <2 x i1> zeroinitializer, <8 x i32> @@ -45,7 +45,7 @@ ; CHECK-LABEL: test3: ; CHECK: # BB#0: ; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vpmovd2m %xmm0, %k0 +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 ; CHECK-NEXT: kshiftrb $4, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 @@ -59,7 +59,7 @@ ; CHECK-LABEL: test4: ; CHECK: # BB#0: ; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vpmovd2m %xmm0, %k0 +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 ; CHECK-NEXT: kshiftrb $4, %k0, %k1 ; CHECK-NEXT: korb %k0, %k1, %k0 @@ -74,7 +74,7 @@ ; CHECK-LABEL: test5: ; CHECK: # BB#0: ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 -; CHECK-NEXT: vpmovq2m %xmm0, %k0 +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 ; CHECK-NEXT: kshiftlw $2, %k0, %k0 ; CHECK-NEXT: kshiftrw $2, %k0, %k1 ; CHECK-NEXT: korw %k0, %k1, %k0 @@ -89,7 +89,7 @@ ; CHECK-LABEL: test6: ; CHECK: # BB#0: ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 -; CHECK-NEXT: vpmovq2m %xmm0, %k0 +; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 ; CHECK-NEXT: kshiftlw $2, %k0, %k0 ; CHECK-NEXT: kshiftrw $2, %k0, %k1 ; CHECK-NEXT: korw %k0, %k1, %k0 @@ -105,7 +105,7 @@ ; CHECK-LABEL: test7: ; CHECK: # BB#0: ; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vpmovd2m %xmm0, %k0 +; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 ; CHECK-NEXT: kshiftlb $4, %k0, %k0 ; CHECK-NEXT: kshiftrb $4, %k0, %k1 ; CHECK-NEXT: korb %k0, %k1, %k0 Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -679,9 +679,8 @@ ; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 -; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0 -; KNL_64-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL_64-NEXT: vpslld $31, %ymm1, 
%ymm0 +; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} ; KNL_64-NEXT: retq ; @@ -691,16 +690,15 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 -; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0 -; KNL_32-NEXT: vpsllvq .LCPI14_0, %zmm0, %zmm0 -; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0 +; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test15: ; SKX: # BB#0: ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX-NEXT: vpmovd2m %xmm1, %k1 +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq @@ -708,7 +706,7 @@ ; SKX_32-LABEL: test15: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovd2m %xmm1, %k1 +; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1} ; SKX_32-NEXT: vmovaps %zmm1, %zmm0 @@ -755,7 +753,7 @@ ; SKX-LABEL: test16: ; SKX: # BB#0: ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX-NEXT: vpmovd2m %xmm1, %k1 +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq @@ -763,7 +761,7 @@ ; SKX_32-LABEL: test16: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovd2m %xmm1, %k1 +; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1} ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 @@ -801,7 +799,7 @@ ; SKX-LABEL: test17: ; SKX: # BB#0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k1 +; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq @@ -809,7 +807,7 @@ ; SKX_32-LABEL: test17: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 +; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 @@ -833,9 +831,8 @@ ; KNL_64: # BB#0: ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 -; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_64-NEXT: retq ; @@ -844,23 +841,22 @@ ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_32-NEXT: vpsllvq .LCPI17_0, %zmm2, %zmm2 -; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 +; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test18: ; SKX: # BB#0: ; SKX-NEXT: vpslld $31, %xmm2, %xmm2 -; SKX-NEXT: vpmovd2m %xmm2, %k1 +; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: test18: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 -; SKX_32-NEXT: vpmovd2m %xmm2, %k1 +; SKX_32-NEXT: vptestmd %xmm2, 
%xmm2, %k1 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) @@ -897,14 +893,14 @@ ; SKX-LABEL: test19: ; SKX: # BB#0: ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX-NEXT: vpmovd2m %xmm1, %k1 +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: test19: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovd2m %xmm1, %k1 +; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1} ; SKX_32-NEXT: retl @@ -922,9 +918,8 @@ ; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 -; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} ; KNL_64-NEXT: retq ; @@ -936,16 +931,15 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 -; KNL_32-NEXT: vpsllvq .LCPI19_0, %zmm2, %zmm2 -; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 +; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test20: ; SKX: # BB#0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX-NEXT: kshiftlw $2, %k0, %k0 ; SKX-NEXT: kshiftrw $2, %k0, %k1 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1} @@ -955,7 +949,7 @@ ; SKX_32: # BB#0: ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX_32-NEXT: kshiftlw $2, %k0, %k0 ; SKX_32-NEXT: kshiftrw $2, %k0, %k1 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} @@ -990,7 +984,7 @@ ; SKX-LABEL: test21: ; SKX: # BB#0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX-NEXT: kshiftlw $2, %k0, %k0 ; SKX-NEXT: kshiftrw $2, %k0, %k1 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1000,7 +994,7 @@ ; SKX_32-LABEL: test21: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX_32-NEXT: kshiftlw $2, %k0, %k0 ; SKX_32-NEXT: kshiftrw $2, %k0, %k1 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1024,9 +1018,8 @@ ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 -; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 -; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1 +; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} ; KNL_64-NEXT: vmovaps %zmm2, %zmm0 ; KNL_64-NEXT: retq @@ -1040,9 +1033,8 @@ ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 -; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_32-NEXT: vpsllvq .LCPI21_0, %zmm1, %zmm1 -; 
KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} ; KNL_32-NEXT: vmovaps %zmm2, %zmm0 ; KNL_32-NEXT: retl @@ -1051,7 +1043,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k0 +; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0 ; SKX-NEXT: kshiftlw $2, %k0, %k0 ; SKX-NEXT: kshiftrw $2, %k0, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} @@ -1062,7 +1054,7 @@ ; SKX_32: # BB#0: ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k0 +; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0 ; SKX_32-NEXT: kshiftlw $2, %k0, %k0 ; SKX_32-NEXT: kshiftrw $2, %k0, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1104,7 +1096,7 @@ ; SKX-LABEL: test23: ; SKX: # BB#0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k1 +; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq @@ -1112,7 +1104,7 @@ ; SKX_32-LABEL: test23: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 +; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 @@ -1189,7 +1181,7 @@ ; SKX-LABEL: test25: ; SKX: # BB#0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k1 +; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} ; SKX-NEXT: vmovaps %zmm2, %zmm0 ; SKX-NEXT: retq @@ -1197,7 +1189,7 @@ ; SKX_32-LABEL: test25: ; SKX_32: # BB#0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 +; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 @@ -1468,7 +1460,7 @@ ; SKX-LABEL: test30: ; SKX: # BB#0: ; SKX-NEXT: vpslld $31, %xmm2, %xmm2 -; SKX-NEXT: vpmovd2m %xmm2, %k1 +; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 @@ -1508,7 +1500,7 @@ ; SKX_32-NEXT: .Ltmp0: ; SKX_32-NEXT: .cfi_def_cfa_offset 16 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 -; SKX_32-NEXT: vpmovd2m %xmm2, %k1 +; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1 ; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2 Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -13,11 +13,11 @@ ; VL_BW_DQ-LABEL: shuf2i1_1_0: ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 -; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 -; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: retq %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> @@ -35,14 +35,14 @@ ; VL_BW_DQ-LABEL: shuf2i1_1_2: ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 -; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: 
vptestmq %xmm0, %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: movb $1, %al ; VL_BW_DQ-NEXT: kmovb %eax, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1 ; VL_BW_DQ-NEXT: vpalignr $8, %xmm0, %xmm1, %xmm0 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 -; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 ; VL_BW_DQ-NEXT: retq %b = shufflevector <2 x i1> %a, <2 x i1> , <2 x i32> @@ -59,11 +59,11 @@ ; VL_BW_DQ-LABEL: shuf4i1_3_2_10: ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] ; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0 +; VL_BW_DQ-NEXT: vptestmd %xmm0, %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0 ; VL_BW_DQ-NEXT: retq %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> @@ -91,7 +91,7 @@ ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 ; VL_BW_DQ-NEXT: retq %a2 = icmp eq <8 x i64> %a, %a1 @@ -125,7 +125,7 @@ ; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; VL_BW_DQ-NEXT: vpslld $31, %zmm1, %zmm0 -; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 ; VL_BW_DQ-NEXT: retq %a2 = icmp eq <16 x i32> %a, %a1 @@ -180,7 +180,7 @@ ; VL_BW_DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm0 ; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -209,7 +209,7 @@ ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -235,7 +235,7 @@ ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1] ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -265,7 +265,7 @@ ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -295,7 +295,7 @@ ; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -330,7 +330,7 @@ ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; 
VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> @@ -363,7 +363,7 @@ ; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax ; VL_BW_DQ-NEXT: retq %c = shufflevector <8 x i1> , <8 x i1> %a, <8 x i32> @@ -389,7 +389,7 @@ ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 ; VL_BW_DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovw %k0, %eax ; VL_BW_DQ-NEXT: retq %b = bitcast i16 %a to <16 x i1>
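For illustration, a minimal IR sketch (hypothetical, not taken from this patch) of the pattern whose lowering changes here: a vector truncate to i1 that is then consumed as a mask. The function and value names below are assumptions; based on the updated CHECK lines in avx512-ext.ll above, an SKX-class target would now be expected to select vpslld $31 plus vptestmd into a mask register for the trunc, rather than vpslld plus vpmovd2m.

; Hypothetical example, in the spirit of trunc_4i32_to_4i1 in avx512-ext.ll:
; the <4 x i32> -> <4 x i1> trunc feeds a mask use, so the revised lowering
; (shift LSB to MSB, then a VPTESTM-style compare) should apply on AVX512VL
; targets without relying on VPMOVD2M.
define <4 x i32> @trunc_4i32_mask_select(<4 x i32> %m, <4 x i32> %a, <4 x i32> %b) {
  %mask = trunc <4 x i32> %m to <4 x i1>
  %res = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %res
}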