Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37359,10 +37359,7 @@ return SDValue(); Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); } else { - // Bail with AVX512VL (which uses predicate registers). - if (Subtarget.hasVLX()) - return SDValue(); - + // FIXME: Better handling of k-registers or 512-bit vectors? unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && Subtarget.hasAVX()))) Index: llvm/test/CodeGen/X86/vector-compare-all_of.ll =================================================================== --- llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -28,9 +28,11 @@ ; AVX512-LABEL: test_v2f64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $3, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: retq %c = fcmp ogt <2 x double> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -67,11 +69,11 @@ ; AVX512-LABEL: test_v4f64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -115,12 +117,11 @@ ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cltq +; AVX512-NEXT: vmovmskps %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -158,11 +159,11 @@ ; AVX512-LABEL: test_v4f32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: retq %c = fcmp ogt <4 x float> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -201,13 +202,11 @@ ; AVX512-LABEL: test_v8f32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $255, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -252,14 +251,11 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0 ; AVX512-NEXT: vpmovm2w %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cwtl +; AVX512-NEXT: vpmovmskb %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -299,9 +295,11 @@ ; AVX512-LABEL: test_v2i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $3, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: retq %c = icmp sgt <2 x i64> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -353,11 +351,11 @@ ; AVX512-LABEL: test_v4i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -416,12 +414,11 @@ ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cltq +; AVX512-NEXT: vmovmskps %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -459,11 +456,11 @@ ; AVX512-LABEL: test_v4i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $15, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: retq %c = icmp sgt <4 x i32> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -517,13 +514,11 @@ ; AVX512-LABEL: test_v8i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $255, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -583,14 +578,11 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; AVX512-NEXT: vpmovm2w %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cwtl +; AVX512-NEXT: vpmovmskb %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -632,13 +624,11 @@ ; AVX512-LABEL: test_v8i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq %c = icmp sgt <8 x i16> %a0, %a1 @@ -698,15 +688,11 @@ ; AVX512-LABEL: test_v16i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: cmpl $-1, %ecx +; AVX512-NEXT: sete %al +; AVX512-NEXT: negl %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -772,15 +758,10 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negb %al ; AVX512-NEXT: movsbl %al, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -822,16 +803,10 @@ ; AVX512-LABEL: test_v16i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX512-NEXT: sete %al +; AVX512-NEXT: negb %al ; AVX512-NEXT: retq %c = icmp sgt <16 x i8> %a0, %a1 %s = sext <16 x i1> %c to <16 x i8> @@ -886,18 +861,10 @@ ; AVX512-LABEL: test_v32i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al +; AVX512-NEXT: negb %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <32 x i8> %a0, %a1 Index: llvm/test/CodeGen/X86/vector-compare-any_of.ll =================================================================== --- llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -24,9 +24,9 @@ ; AVX512-LABEL: test_v2f64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: retq %c = fcmp ogt <2 x double> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -59,11 +59,9 @@ ; AVX512-LABEL: test_v4f64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -103,12 +101,9 @@ ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cltq +; AVX512-NEXT: vmovmskps %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -142,11 +137,9 @@ ; AVX512-LABEL: test_v4f32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: retq %c = fcmp ogt <4 x float> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -181,13 +174,9 @@ ; AVX512-LABEL: test_v8f32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -228,14 +217,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0 ; AVX512-NEXT: vpmovm2w %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cwtl +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -271,9 +255,9 @@ ; AVX512-LABEL: test_v2i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: retq %c = icmp sgt <2 x i64> %a0, %a1 %s = sext <2 x i1> %c to <2 x i64> @@ -319,11 +303,9 @@ ; AVX512-LABEL: test_v4i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovmskpd %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -376,12 +358,9 @@ ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cltq +; AVX512-NEXT: vmovmskps %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbq %rax, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -415,11 +394,9 @@ ; AVX512-LABEL: test_v4i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: retq %c = icmp sgt <4 x i32> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -467,13 +444,9 @@ ; AVX512-LABEL: test_v8i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -527,14 +500,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; AVX512-NEXT: vpmovm2w %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: cwtl +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -572,13 +540,9 @@ ; AVX512-LABEL: test_v8i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq %c = icmp sgt <8 x i16> %a0, %a1 @@ -632,15 +596,9 @@ ; AVX512-LABEL: test_v16i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -700,16 +658,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: movsbl %al, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -750,15 +701,9 @@ ; AVX512-LABEL: test_v16i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpmovmskb %xmm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq %c = icmp sgt <16 x i8> %a0, %a1 @@ -814,17 +759,9 @@ ; AVX512-LABEL: test_v32i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: sbbl %eax, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq