[X86] Add a "prefer-movmsk-over-vtest" tuning flag (TuningPreferMovmskOverVTest / PreferMovmskOverVTest).
The flag is appended to ADLAdditionalTuning, and the MOVMSKPS/MOVMSKPD -> TESTPS/TESTPD fold in X86ISelLowering.cpp is skipped when Subtarget.preferMovmskOverVTest() is set.
The ADL RUN lines in combine-movmsk-avx.ll and combine-movmsk.ll switch from -mcpu=alderlake to -mattr=+avx2,+prefer-movmsk-over-vtest, and the ADL checks are updated to the movmsk sequences.
NOTE(review): this dump has its line breaks and indentation collapsed; re-export the patch from the original commit before trying to apply it.
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -422,6 +422,9 @@ //===----------------------------------------------------------------------===// // X86 Subtarget Tuning features //===----------------------------------------------------------------------===// +def TuningPreferMovmskOverVTest : SubtargetFeature<"prefer-movmsk-over-vtest", + "PreferMovmskOverVTest", "true", + "Prefer movmsk over vtest instruction">; def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; @@ -1166,7 +1169,8 @@ FeatureMOVDIRI, FeatureMOVDIR64B, FeatureWAITPKG]; - list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps]; + list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps, + TuningPreferMovmskOverVTest]; list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); list<SubtargetFeature> ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48024,7 +48024,8 @@ // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V) // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V) // iff every element is referenced. 
- if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse && + if (NumElts <= CmpBits && Subtarget.hasAVX() && + !Subtarget.preferMovmskOverVTest() && IsOneUse && (NumEltBits == 32 || NumEltBits == 64)) { SDLoc DL(EFLAGS); MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits); diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll --- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) @@ -22,7 +22,8 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 -; ADL-NEXT: vtestpd %ymm0, %ymm0 +; ADL-NEXT: vmovmskpd %ymm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: vzeroupper ; ADL-NEXT: retq @@ -59,9 +60,9 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 -; ADL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; ADL-NEXT: vtestpd %ymm1, %ymm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %ymm0, %eax +; ADL-NEXT: cmpl $15, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: vzeroupper ; ADL-NEXT: retq %1 = fcmp oeq <4 x double> %a0, zeroinitializer @@ -203,10 +204,10 @@ ; ADL-LABEL: movmskps_concat_v4f32: ; ADL: # %bb.0: ; ADL-NEXT: vorps %xmm1, %xmm0, %xmm0 +; ADL-NEXT: vmovmskps %xmm0, %ecx ; ADL-NEXT: xorl %eax, %eax -; ADL-NEXT: vtestps %xmm0, %xmm0 -; ADL-NEXT: setne 
%al -; ADL-NEXT: negl %eax +; ADL-NEXT: negl %ecx +; ADL-NEXT: sbbl %eax, %eax ; ADL-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %1) diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll --- a/llvm/test/CodeGen/X86/combine-movmsk.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) @@ -33,7 +33,8 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm0 -; ADL-NEXT: vtestpd %xmm0, %xmm0 +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <2 x double> zeroinitializer, %a0 @@ -67,9 +68,9 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestpd %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: cmpl $3, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <2 x double> zeroinitializer, %a0 %2 = sext <2 x i1> %1 to <2 x i64> @@ -103,7 +104,8 @@ ; ; ADL-LABEL: pmovmskb_noneof_bitcast_v2i64: ; ADL: # %bb.0: -; ADL-NEXT: vtestpd %xmm0, %xmm0 +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = icmp sgt <2 x i64> zeroinitializer, 
%a0 @@ -139,9 +141,9 @@ ; ; ADL-LABEL: pmovmskb_allof_bitcast_v2i64: ; ADL: # %bb.0: -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestpd %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: cmpl $3, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = icmp sgt <2 x i64> zeroinitializer, %a0 %2 = sext <2 x i1> %1 to <2 x i64> @@ -173,7 +175,8 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vtestps %xmm0, %xmm0 +; ADL-NEXT: vmovmskps %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <4 x float> %a0, zeroinitializer @@ -207,9 +210,9 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestps %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskps %xmm0, %eax +; ADL-NEXT: cmpl $15, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <4 x float> %a0, zeroinitializer %2 = sext <4 x i1> %1 to <4 x i32> @@ -513,10 +516,11 @@ ; ADL: # %bb.0: ; ADL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; ADL-NEXT: vmovmskps %xmm0, %ecx ; ADL-NEXT: xorl %eax, %eax -; ADL-NEXT: vtestps %xmm1, %xmm0 -; ADL-NEXT: sbbl %eax, %eax +; ADL-NEXT: cmpl $15, %ecx +; ADL-NEXT: sete %al +; ADL-NEXT: negl %eax ; ADL-NEXT: retq %1 = icmp eq <16 x i8> %a0, zeroinitializer %2 = sext <16 x i1> %1 to <16 x i8>