diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37619,14 +37619,26 @@
   // movmskb even with avx512. This will be better than truncating to vXi1 and
   // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
   // vpcmpeqb/vpcmpgtb.
-  bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
-                     (Src.getOperand(0).getValueType() == MVT::v16i8 ||
-                      Src.getOperand(0).getValueType() == MVT::v32i8 ||
-                      Src.getOperand(0).getValueType() == MVT::v64i8);
+  bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
+                      (Src.getOperand(0).getValueType() == MVT::v16i8 ||
+                       Src.getOperand(0).getValueType() == MVT::v32i8 ||
+                       Src.getOperand(0).getValueType() == MVT::v64i8);
+
+  // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
+  // directly with vpmovmskb/vmovmskps/vmovmskpd.
+  if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
+      cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
+      ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
+    EVT CmpVT = Src.getOperand(0).getValueType();
+    EVT EltVT = CmpVT.getVectorElementType();
+    if (CmpVT.getSizeInBits() <= 256 &&
+        (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
+      PreferMovMsk = true;
+  }
 
   // With AVX512 vxi1 types are legal and we prefer using k-regs.
   // MOVMSK is supported in SSE2 or later.
-  if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
+  if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
     return SDValue();
 
   // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
@@ -38042,6 +38054,47 @@
     return DAG.getConstant(0, SDLoc(N0), VT);
   }
 
+  // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
+  // Turn it into a sign bit compare that produces a k-register. This avoids
+  // a trip through a GPR.
+  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+      VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+      isPowerOf2_32(VT.getVectorNumElements())) {
+    unsigned NumElts = VT.getVectorNumElements();
+    SDValue Src = N0;
+
+    // Peek through truncate.
+    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+      Src = N0.getOperand(0);
+
+    if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
+      SDValue MovmskIn = Src.getOperand(0);
+      MVT MovmskVT = MovmskIn.getSimpleValueType();
+      unsigned MovMskElts = MovmskVT.getVectorNumElements();
+
+      // We allow extra bits of the movmsk to be used since they are known zero.
+      // We can't convert a VPMOVMSKB without avx512bw.
+      if (MovMskElts <= NumElts &&
+          (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
+        EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
+        MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
+        SDLoc dl(N);
+        MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
+        SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
+                                   DAG.getConstant(0, dl, IntVT), ISD::SETLT);
+        if (EVT(CmpVT) == VT)
+          return Cmp;
+
+        // Pad with zeroes up to original VT to replace the zeroes that were
+        // being used from the MOVMSK.
+        unsigned NumConcats = NumElts / MovMskElts;
+        SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
+        Ops[0] = Cmp;
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
+      }
+    }
+  }
+
   // Try to remove bitcasts from input and output of mask arithmetic to
   // remove GPR<->K-register crossings.
   if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
diff --git a/llvm/test/CodeGen/X86/avx512-movmsk.ll b/llvm/test/CodeGen/X86/avx512-movmsk.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-movmsk.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq,avx512bw | FileCheck %s --check-prefixes=AVX512VLDQBW
+
+; This test makes sure we don't use movmsk instructions when masked compares
+; would be better. The use of the getmant intrinsic introduces a scalar-to-vXi1
+; conversion late, after movmsk has been formed, requiring it to be reversed.
+
+declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8)
+declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4 x double>, i8)
+declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+
+define <2 x double> @movmsk2(<2 x double> %x0, <2 x double> %x2, <2 x i64> %mask) {
+; AVX512VL-LABEL: movmsk2:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtq %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQBW-LABEL: movmsk2:
+; AVX512VLDQBW:       ## %bb.0:
+; AVX512VLDQBW-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512VLDQBW-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1}
+; AVX512VLDQBW-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512VLDQBW-NEXT:    retq
+  %a = icmp slt <2 x i64> %mask, zeroinitializer
+  %b = bitcast <2 x i1> %a to i2
+  %c = zext i2 %b to i8
+  %res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %c)
+  ret <2 x double> %res
+}
+
+define <4 x double> @movmsk4(<4 x double> %x0, <4 x double> %x2, <4 x i32> %mask) {
+; AVX512VL-LABEL: movmsk4:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtd %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vgetmantpd $11, %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQBW-LABEL: movmsk4:
+; AVX512VLDQBW:       ## %bb.0:
+; AVX512VLDQBW-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512VLDQBW-NEXT:    vgetmantpd $11, %ymm0, %ymm1 {%k1}
+; AVX512VLDQBW-NEXT:    vmovapd %ymm1, %ymm0
+; AVX512VLDQBW-NEXT:    retq
+  %a = icmp slt <4 x i32> %mask, zeroinitializer
+  %b = bitcast <4 x i1> %a to i4
+  %c = zext i4 %b to i8
+  %res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %c)
+  ret <4 x double> %res
+}
+
+define <8 x double> @movmsk8(<8 x double> %x0, <8 x double> %x2, <8 x i32> %mask) {
+; AVX512VL-LABEL: movmsk8:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtd %ymm2, %ymm3, %k1
+; AVX512VL-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQBW-LABEL: movmsk8:
+; AVX512VLDQBW:       ## %bb.0:
+; AVX512VLDQBW-NEXT:    vpmovd2m %ymm2, %k1
+; AVX512VLDQBW-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
+; AVX512VLDQBW-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512VLDQBW-NEXT:    retq
+  %a = icmp slt <8 x i32> %mask, zeroinitializer
+  %b = bitcast <8 x i1> %a to i8
+  %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %b, i32 4)
+  ret <8 x double> %res
+}
+
+define <16 x float> @movmsk16(<16 x float> %x0, <16 x float> %x2, <16 x i8> %mask) {
+; AVX512VL-LABEL: movmsk16:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmovmskb %xmm2, %eax
+; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    vgetmantps $11, %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQBW-LABEL: movmsk16:
+; AVX512VLDQBW:       ## %bb.0:
+; AVX512VLDQBW-NEXT:    vpmovb2m %xmm2, %k1
+; AVX512VLDQBW-NEXT:    vgetmantps $11, %zmm0, %zmm1 {%k1}
+; AVX512VLDQBW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512VLDQBW-NEXT:    retq
+  %a = icmp slt <16 x i8> %mask, zeroinitializer
+  %b = bitcast <16 x i1> %a to i16
+  %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %b, i32 4)
+  ret <16 x float> %res
+}
+
+; Similar to above but with fp types bitcasted to int for the slt.
+define <2 x double> @movmsk2_fp(<2 x double> %x0, <2 x double> %x2, <2 x double> %mask) {
+; AVX512VL-LABEL: movmsk2_fp:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtq %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQBW-LABEL: movmsk2_fp:
+; AVX512VLDQBW:       ## %bb.0:
+; AVX512VLDQBW-NEXT:    vpmovq2m %xmm2, %k1
+; AVX512VLDQBW-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1}
+; AVX512VLDQBW-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512VLDQBW-NEXT:    retq
+  %q = bitcast <2 x double> %mask to <2 x i64>
+  %a = icmp slt <2 x i64> %q, zeroinitializer
+  %b = bitcast <2 x i1> %a to i2
+  %c = zext i2 %b to i8
+  %res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %c)
+  ret <2 x double> %res
+}
+
+define <4 x double> @movmsk4_fp(<4 x double> %x0, <4 x double> %x2, <4 x float> %mask) {
+; AVX512VL-LABEL: movmsk4_fp:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtd %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vgetmantpd $11, %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vmovapd %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQBW-LABEL: movmsk4_fp:
+; AVX512VLDQBW:       ## %bb.0:
+; AVX512VLDQBW-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512VLDQBW-NEXT:    vgetmantpd $11, %ymm0, %ymm1 {%k1}
+; AVX512VLDQBW-NEXT:    vmovapd %ymm1, %ymm0
+; AVX512VLDQBW-NEXT:    retq
+  %q = bitcast <4 x float> %mask to <4 x i32>
+  %a = icmp slt <4 x i32> %q, zeroinitializer
+  %b = bitcast <4 x i1> %a to i4
+  %c = zext i4 %b to i8
+  %res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %c)
+  ret <4 x double> %res
+}
+
+define <8 x double> @movmsk8_fp(<8 x double> %x0, <8 x double> %x2, <8 x float> %mask) {
+; AVX512VL-LABEL: movmsk8_fp:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtd %ymm2, %ymm3, %k1
+; AVX512VL-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512VLDQBW-LABEL: movmsk8_fp:
+; AVX512VLDQBW:       ## %bb.0:
+; AVX512VLDQBW-NEXT:    vpmovd2m %ymm2, %k1
+; AVX512VLDQBW-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
+; AVX512VLDQBW-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512VLDQBW-NEXT:    retq
+  %q = bitcast <8 x float> %mask to <8 x i32>
+  %a = icmp slt <8 x i32> %q, zeroinitializer
+  %b = bitcast <8 x i1> %a to i8
+  %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %b, i32 4)
+  ret <8 x double> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -6404,8 +6404,7 @@
 define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovb2m %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x29,0xc0]
-; CHECK-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpmovmskb %xmm0, %eax # encoding: [0xc5,0xf9,0xd7,0xc0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0)
@@ -6417,8 +6416,7 @@
 define i32@test_int_x86_avx512_cvtb2mask_256(<32 x i8> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovb2m %ymm0, %k0 # encoding: [0x62,0xf2,0x7e,0x28,0x29,0xc0]
-; CHECK-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vpmovmskb %ymm0, %eax # encoding: [0xc5,0xfd,0xd7,0xc0]
 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8> %x0)
diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
--- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -2764,8 +2764,7 @@
 define i8@test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovd2m %ymm0, %k0 # encoding: [0x62,0xf2,0x7e,0x28,0x39,0xc0]
-; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    vmovmskps %ymm0, %eax # encoding: [0xc5,0xfc,0x50,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
--- a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -420,23 +420,12 @@
 ; AVX12-NEXT:    vzeroupper
 ; AVX12-NEXT:    retq
 ;
-; AVX512F-LABEL: bitcast_8i32_store:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
-; AVX512F-NEXT:    kmovw %k0, %eax
-; AVX512F-NEXT:    movb %al, (%rdi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: bitcast_8i32_store:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movb %al, (%rdi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: bitcast_8i32_store:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovmskps %ymm0, %eax
+; AVX512-NEXT:    movb %al, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %a1 = icmp slt <8 x i32> %a0, zeroinitializer
   %a2 = bitcast <8 x i1> %a1 to i8
   store i8 %a2, i8* %p
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -67,9 +67,7 @@
 ;
 ; AVX512-LABEL: bitcast_v4i32_to_v2i2:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0
-; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    vmovmskps %xmm0, %eax
 ; AVX512-NEXT:    movl %eax, %ecx
 ; AVX512-NEXT:    shrb $2, %cl
 ; AVX512-NEXT:    andb $3, %al
@@ -146,11 +144,9 @@
 ;
 ; AVX512-LABEL: bitcast_v16i8_to_v2i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovb2m %xmm0, %k0
-; AVX512-NEXT:    kmovw %k0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %ecx
-; AVX512-NEXT:    vpextrb $1, %xmm0, %eax
+; AVX512-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX512-NEXT:    movl %ecx, %eax
+; AVX512-NEXT:    shrl $8, %eax
 ; AVX512-NEXT:    addb %cl, %al
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
@@ -191,9 +187,7 @@
 ;
 ; AVX512-LABEL: bitcast_v4i64_to_v2i2:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpgtq %ymm0, %ymm1, %k0
-; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    vmovmskpd %ymm0, %eax
 ; AVX512-NEXT:    movl %eax, %ecx
 ; AVX512-NEXT:    shrb $2, %cl
 ; AVX512-NEXT:    andb $3, %al
@@ -235,9 +229,7 @@
 ;
 ; AVX512-LABEL: bitcast_v8i32_to_v2i4:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
-; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    vmovmskps %ymm0, %eax
 ; AVX512-NEXT:    movl %eax, %ecx
 ; AVX512-NEXT:    shrb $4, %cl
 ; AVX512-NEXT:    andb $15, %al
@@ -338,19 +330,11 @@
 ;
 ; AVX512-LABEL: bitcast_v32i8_to_v2i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-32, %rsp
-; AVX512-NEXT:    subq $32, %rsp
-; AVX512-NEXT:    vpmovb2m %ymm0, %k0
-; AVX512-NEXT:    kmovd %k0, (%rsp)
-; AVX512-NEXT:    vmovdqa (%rsp), %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %ecx
-; AVX512-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX512-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX512-NEXT:    movl %ecx, %eax
+; AVX512-NEXT:    shrl $16, %eax
 ; AVX512-NEXT:    addl %ecx, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = icmp slt <32 x i8> %a0, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -29,9 +29,9 @@
 ;
 ; SKX-LABEL: allones_v16i8_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovb2m %xmm0, %k0
-; SKX-NEXT:    kortestw %k0, %k0
-; SKX-NEXT:    setb %al
+; SKX-NEXT:    vpmovmskb %xmm0, %eax
+; SKX-NEXT:    cmpw $-1, %ax
+; SKX-NEXT:    sete %al
 ; SKX-NEXT:    retq
   %tmp = icmp slt <16 x i8> %arg, zeroinitializer
   %tmp1 = bitcast <16 x i1> %tmp to i16
@@ -63,8 +63,8 @@
 ;
 ; SKX-LABEL: allzeros_v16i8_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovb2m %xmm0, %k0
-; SKX-NEXT:    kortestw %k0, %k0
+; SKX-NEXT:    vpmovmskb %xmm0, %eax
+; SKX-NEXT:    testw %ax, %ax
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    retq
   %tmp = icmp slt <16 x i8> %arg, zeroinitializer
@@ -114,9 +114,9 @@
 ;
 ; SKX-LABEL: allones_v32i8_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovb2m %ymm0, %k0
-; SKX-NEXT:    kortestd %k0, %k0
-; SKX-NEXT:    setb %al
+; SKX-NEXT:    vpmovmskb %ymm0, %eax
+; SKX-NEXT:    cmpl $-1, %eax
+; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %tmp = icmp slt <32 x i8> %arg, zeroinitializer
@@ -164,8 +164,8 @@
 ;
 ; SKX-LABEL: allzeros_v32i8_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovb2m %ymm0, %k0
-; SKX-NEXT:    kortestd %k0, %k0
+; SKX-NEXT:    vpmovmskb %ymm0, %eax
+; SKX-NEXT:    testl %eax, %eax
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -653,20 +653,14 @@
 ;
 ; KNL-LABEL: allones_v4i32_sign:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    vmovmskps %xmm0, %eax
 ; KNL-NEXT:    cmpb $15, %al
 ; KNL-NEXT:    sete %al
-; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: allones_v4i32_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovd2m %xmm0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    vmovmskps %xmm0, %eax
 ; SKX-NEXT:    cmpb $15, %al
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    retq
@@ -693,19 +687,15 @@
 ;
 ; KNL-LABEL: allzeros_v4i32_sign:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    vmovmskps %xmm0, %eax
+; KNL-NEXT:    testb %al, %al
 ; KNL-NEXT:    sete %al
-; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: allzeros_v4i32_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovd2m %xmm0, %k0
-; SKX-NEXT:    kortestb %k0, %k0
+; SKX-NEXT:    vmovmskps %xmm0, %eax
+; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    retq
   %tmp = icmp slt <4 x i32> %arg, zeroinitializer
@@ -734,10 +724,7 @@
 ;
 ; KNL-LABEL: allones_v8i32_sign:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vmovmskps %ymm0, %eax
 ; KNL-NEXT:    cmpb $-1, %al
 ; KNL-NEXT:    sete %al
 ; KNL-NEXT:    vzeroupper
@@ -745,9 +732,9 @@
 ;
 ; SKX-LABEL: allones_v8i32_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovd2m %ymm0, %k0
-; SKX-NEXT:    kortestb %k0, %k0
-; SKX-NEXT:    setb %al
+; SKX-NEXT:    vmovmskps %ymm0, %eax
+; SKX-NEXT:    cmpb $-1, %al
+; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %tmp = icmp slt <8 x i32> %arg, zeroinitializer
@@ -775,10 +762,7 @@
 ;
 ; KNL-LABEL: allzeros_v8i32_sign:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vmovmskps %ymm0, %eax
 ; KNL-NEXT:    testb %al, %al
 ; KNL-NEXT:    sete %al
 ; KNL-NEXT:    vzeroupper
@@ -786,8 +770,8 @@
 ;
 ; SKX-LABEL: allzeros_v8i32_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovd2m %ymm0, %k0
-; SKX-NEXT:    kortestb %k0, %k0
+; SKX-NEXT:    vmovmskps %ymm0, %eax
+; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -936,11 +920,7 @@
 ;
 ; KNL-LABEL: allones_v4i64_sign:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    vmovmskpd %ymm0, %eax
 ; KNL-NEXT:    cmpb $15, %al
 ; KNL-NEXT:    sete %al
 ; KNL-NEXT:    vzeroupper
@@ -948,8 +928,7 @@
 ;
 ; SKX-LABEL: allones_v4i64_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovq2m %ymm0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    vmovmskpd %ymm0, %eax
 ; SKX-NEXT:    cmpb $15, %al
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
@@ -979,19 +958,16 @@
 ;
 ; KNL-LABEL: allzeros_v4i64_sign:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    vmovmskpd %ymm0, %eax
+; KNL-NEXT:    testb %al, %al
 ; KNL-NEXT:    sete %al
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: allzeros_v4i64_sign:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovq2m %ymm0, %k0
-; SKX-NEXT:    kortestb %k0, %k0
+; SKX-NEXT:    vmovmskpd %ymm0, %eax
+; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -3924,19 +3900,12 @@
 ;
 ; KNL-LABEL: movmskpd:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    andl $3, %eax
-; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    vmovmskpd %xmm0, %eax
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: movmskpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovq2m %xmm0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
-; SKX-NEXT:    andl $3, %eax
+; SKX-NEXT:    vmovmskpd %xmm0, %eax
 ; SKX-NEXT:    retq
   %a = bitcast <2 x double> %x to <2 x i64>
   %b = icmp slt <2 x i64> %a, zeroinitializer
@@ -3958,19 +3927,12 @@
 ;
 ; KNL-LABEL: movmskps:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    andl $15, %eax
-; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    vmovmskps %xmm0, %eax
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: movmskps:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovd2m %xmm0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
-; SKX-NEXT:    andl $15, %eax
+; SKX-NEXT:    vmovmskps %xmm0, %eax
 ; SKX-NEXT:    retq
   %a = bitcast <4 x float> %x to <4 x i32>
   %b = icmp slt <4 x i32> %a, zeroinitializer
@@ -3994,19 +3956,13 @@
 ;
 ; KNL-LABEL: movmskpd256:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    andl $15, %eax
+; KNL-NEXT:    vmovmskpd %ymm0, %eax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: movmskpd256:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovq2m %ymm0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
-; SKX-NEXT:    andl $15, %eax
+; SKX-NEXT:    vmovmskpd %ymm0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %a = bitcast <4 x double> %x to <4 x i64>
@@ -4033,18 +3989,13 @@
 ;
 ; KNL-LABEL: movmskps256:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    vmovmskps %ymm0, %eax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: movmskps256:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovd2m %ymm0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    vmovmskps %ymm0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %a = bitcast <8 x float> %x to <8 x i32>
@@ -4072,8 +4023,7 @@
 ;
 ; SKX-LABEL: movmskb:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovb2m %xmm0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    vpmovmskb %xmm0, %eax
 ; SKX-NEXT:    retq
   %a = icmp slt <16 x i8> %x, zeroinitializer
   %b = bitcast <16 x i1> %a to i16
@@ -4114,8 +4064,7 @@
 ;
 ; SKX-LABEL: movmskb256:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovb2m %ymm0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    vpmovmskb %ymm0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %a = icmp slt <32 x i8> %x, zeroinitializer