diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -422,6 +422,8 @@ //===----------------------------------------------------------------------===// // X86 Subtarget Tuning features //===----------------------------------------------------------------------===// +def TuningSlowVtest : SubtargetFeature<"slow-vtest", "HasSlowVtest", "true", + "VTEST instruction is slow">; def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; @@ -1166,7 +1168,8 @@ FeatureMOVDIRI, FeatureMOVDIR64B, FeatureWAITPKG]; - list ADLAdditionalTuning = [TuningPERMFalseDeps]; + list ADLAdditionalTuning = [TuningPERMFalseDeps, + TuningSlowVtest]; list ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); list ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48021,8 +48021,8 @@ // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V) // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V) // iff every element is referenced. - if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse && - (NumEltBits == 32 || NumEltBits == 64)) { + if (NumElts <= CmpBits && Subtarget.hasAVX() && !Subtarget.hasSlowVtest() && + IsOneUse && (NumEltBits == 32 || NumEltBits == 64)) { SDLoc DL(EFLAGS); MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits); MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts); diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll --- a/llvm/test/CodeGen/X86/combine-movmsk.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk.ll @@ -33,7 +33,8 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm0 -; ADL-NEXT: vtestpd %xmm0, %xmm0 +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <2 x double> zeroinitializer, %a0 @@ -67,9 +68,9 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestpd %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: cmpl $3, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <2 x double> zeroinitializer, %a0 %2 = sext <2 x i1> %1 to <2 x i64> @@ -103,7 +104,8 @@ ; ; ADL-LABEL: pmovmskb_noneof_bitcast_v2i64: ; ADL: # %bb.0: -; ADL-NEXT: vtestpd %xmm0, %xmm0 +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = icmp sgt <2 x i64> zeroinitializer, %a0 @@ -139,9 +141,9 @@ ; ; ADL-LABEL: pmovmskb_allof_bitcast_v2i64: ; ADL: # %bb.0: -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestpd %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskpd %xmm0, %eax +; ADL-NEXT: cmpl $3, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = icmp sgt <2 x i64> zeroinitializer, %a0 %2 = sext <2 x i1> %1 to <2 x i64> @@ -173,7 +175,8 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vtestps %xmm0, %xmm0 +; ADL-NEXT: vmovmskps %xmm0, %eax +; ADL-NEXT: testl %eax, %eax ; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <4 x float> %a0, zeroinitializer @@ -207,9 +210,9 @@ ; ADL: # %bb.0: ; ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; ADL-NEXT: vtestps %xmm1, %xmm0 -; ADL-NEXT: setb %al +; ADL-NEXT: vmovmskps %xmm0, %eax +; ADL-NEXT: cmpl $15, %eax +; ADL-NEXT: sete %al ; ADL-NEXT: retq %1 = fcmp oeq <4 x float> %a0, zeroinitializer %2 = sext <4 x i1> %1 to <4 x i32> @@ -513,10 +516,11 @@ ; ADL: # %bb.0: ; ADL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ADL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; ADL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; ADL-NEXT: vmovmskps %xmm0, %ecx ; ADL-NEXT: xorl %eax, %eax -; ADL-NEXT: vtestps %xmm1, %xmm0 -; ADL-NEXT: sbbl %eax, %eax +; ADL-NEXT: cmpl $15, %ecx +; ADL-NEXT: sete %al +; ADL-NEXT: negl %eax ; ADL-NEXT: retq %1 = icmp eq <16 x i8> %a0, zeroinitializer %2 = sext <16 x i1> %1 to <16 x i8>