Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -34306,8 +34306,11 @@
         ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
     return SDValue();
 
-  // Don't bother performing this for 2-element vectors.
-  if (Match.getValueType().getVectorNumElements() <= 2)
+  // Make sure this isn't a vector of 1 element. The perf win from using MOVMSK
+  // diminishes with fewer elements in the reduction, but it is generally better
+  // to get the comparison over to the GPRs as soon as possible to reduce the
+  // number of vector ops.
+  if (Match.getValueType().getVectorNumElements() < 2)
     return SDValue();
 
   // Check that we are extracting a reduction of all sign bits.
Index: llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
+++ llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
@@ -8,17 +8,21 @@
 ; SSE-LABEL: test_v2f64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm1, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl $3, %ecx
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    negq %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_sext:
@@ -42,9 +46,11 @@
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    andpd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm2, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_sext:
@@ -273,17 +279,21 @@
 ; SSE-LABEL: test_v2i64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2i64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %ecx
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    cmpl $3, %ecx
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    negq %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2i64_sext:
@@ -307,9 +317,11 @@
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %ecx
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    cmpl $3, %ecx
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    negq %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i64_sext:
Index: llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
+++ llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
@@ -8,17 +8,17 @@
 ; SSE-LABEL: test_v2f64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm1, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vorpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbq %rax, %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_sext:
@@ -42,9 +42,9 @@
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    orpd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movmskpd %xmm2, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_sext:
@@ -255,17 +255,17 @@
 ; SSE-LABEL: test_v2i64_sext:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2i64_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovmskpd %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbq %rax, %rax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2i64_sext:
@@ -289,9 +289,9 @@
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    movmskpd %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbq %rax, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i64_sext:
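For reference, the all_of/any_of pattern these CHECK lines exercise is a per-element compare whose sign-extended result is reduced with AND (all_of) or OR (any_of) and extracted to a scalar. The IR bodies of the tests are not included in this diff; the sketch below is a representative v2i64 all_of reduction (function name and exact shape assumed, not taken from the test file) that, with the threshold relaxed from `<= 2` to `< 2`, now reaches the MOVMSK lowering:

  define i64 @all_of_v2i64(<2 x i64> %a, <2 x i64> %b) {
    %c  = icmp sgt <2 x i64> %a, %b           ; per-element compare -> <2 x i1>
    %s  = sext <2 x i1> %c to <2 x i64>       ; all-ones / all-zeros lanes
    %hi = shufflevector <2 x i64> %s, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    %r  = and <2 x i64> %s, %hi               ; AND-reduce the two lanes
    %e  = extractelement <2 x i64> %r, i32 0  ; scalar all_of result (0 or -1)
    ret i64 %e
  }

Previously such 2-element reductions were lowered with a vector shuffle plus PAND/POR followed by MOVQ; after this change they go through MOVMSKPD and scalar compare/neg/sbb sequences, as shown in the updated SSE and AVX CHECK lines above.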