Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -17989,16 +17989,6 @@
     }
   }
 
-  // We are handling one of the integer comparisons here. Since SSE only has
-  // GT and EQ comparisons for integer, swapping operands and multiple
-  // operations may be required for some comparisons.
-  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
-                                                            : X86ISD::PCMPGT;
-  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
-              Cond == ISD::SETGE || Cond == ISD::SETUGE;
-  bool Invert = Cond == ISD::SETNE ||
-                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
-
   // If both operands are known non-negative, then an unsigned compare is the
   // same as a signed compare and there's no need to flip signbits.
   // TODO: We could check for more general simplifications here since we're
@@ -18006,27 +17996,47 @@
   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
 
-  // Special case: Use min/max operations for SETULE/SETUGE
-  MVT VET = VT.getVectorElementType();
-  bool HasMinMax =
-      (Subtarget.hasAVX512() && VET == MVT::i64) ||
-      (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
-      (Subtarget.hasSSE2() && (VET == MVT::i8));
-  bool MinMax = false;
-  if (HasMinMax) {
+  // Special case: Use min/max operations for unsigned compares. We only want
+  // to do this for unsigned compares if we need to flip signs or if it allows
+  // us to avoid an invert.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (ISD::isUnsignedIntSetCC(Cond) &&
+      (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
+      TLI.isOperationLegal(ISD::UMIN, VT)) {
+    bool Invert = false;
+    unsigned Opc;
     switch (Cond) {
-    default: break;
-    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
-    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
+    default: llvm_unreachable("Unexpected condition code");
+    case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
+    case ISD::SETULE: Opc = ISD::UMIN; break;
+    case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
+    case ISD::SETUGE: Opc = ISD::UMAX; break;
     }
-    if (MinMax)
-      Swap = Invert = FlipSigns = false;
+    SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+
+    // If the logical-not of the result is required, perform that now.
+    if (Invert)
+      Result = DAG.getNOT(dl, Result, VT);
+
+    return Result;
   }
 
+  // We are handling one of the integer comparisons here. Since SSE only has
+  // GT and EQ comparisons for integer, swapping operands and multiple
+  // operations may be required for some comparisons.
+  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
+                                                            : X86ISD::PCMPGT;
+  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
+              Cond == ISD::SETGE || Cond == ISD::SETUGE;
+  bool Invert = Cond == ISD::SETNE ||
+                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+
+  MVT VET = VT.getVectorElementType();
   bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
   bool Subus = false;
-  if (!MinMax && HasSubus) {
+  if (HasSubus) {
     // As another special case, use PSUBUS[BW] when it's profitable. E.g.
for // Op0 u<= Op1: // t = psubus Op0, Op1 @@ -18145,9 +18155,6 @@ if (Invert) Result = DAG.getNOT(dl, Result, VT); - if (MinMax) - Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); - if (Subus) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, getZeroVector(VT, Subtarget, DAG, dl)); Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -989,11 +989,11 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) { ; KNL-LABEL: test_extractelement_v32i1: ; KNL: ## %bb.0: -; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1018,12 +1018,12 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: test_extractelement_v64i1: ; KNL: ## %bb.0: -; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; KNL-NEXT: vpxor %ymm0, %ymm3, %ymm2 -; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0 +; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1054,12 +1054,12 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: extractelement_v64i1_alt: ; KNL: ## %bb.0: -; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; KNL-NEXT: vpxor %ymm0, %ymm3, %ymm2 -; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0 +; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1645,10 +1645,9 @@ ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, %rsp ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi -; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vmovdqa %ymm0, (%rsp) ; KNL-NEXT: andl $31, %edi ; KNL-NEXT: movzbl (%rsp,%rdi), %eax @@ -1707,9 +1706,10 @@ ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, 
%rsp ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi -; KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: andl $31, %esi ; KNL-NEXT: testb %dil, %dil ; KNL-NEXT: vmovdqa %ymm0, (%rsp) @@ -1771,11 +1771,13 @@ ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $128, %rsp ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi -; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; KNL-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpminub %ymm2, %ymm0, %ymm3 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vpminub %ymm2, %ymm1, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; KNL-NEXT: andl $63, %esi ; KNL-NEXT: testb %dil, %dil ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) @@ -1952,14 +1954,16 @@ ; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3 ; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3 ; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; KNL-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 -; KNL-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; KNL-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1 -; KNL-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; KNL-NEXT: vpcmpgtb %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm4 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 +; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vpminub %ymm3, %ymm0, %ymm4 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: movl 744(%rbp), %eax ; KNL-NEXT: andl $127, %eax ; KNL-NEXT: cmpb $0, 736(%rbp) @@ -2173,15 +2177,19 @@ ; KNL-NEXT: andq $-128, %rsp ; KNL-NEXT: subq $256, %rsp ## imm = 0x100 ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi -; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; KNL-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; KNL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm4, %ymm1, %ymm1 -; KNL-NEXT: vpcmpgtb %ymm4, %ymm1, %ymm1 -; KNL-NEXT: vpxor %ymm4, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm4, %ymm2, %ymm2 -; KNL-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; KNL-NEXT: vpcmpgtb %ymm4, %ymm3, %ymm3 +; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; KNL-NEXT: vpminub %ymm4, %ymm0, %ymm5 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vpminub %ymm4, %ymm1, %ymm5 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1 +; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vpminub %ymm4, %ymm2, %ymm5 +; KNL-NEXT: vpcmpeqb %ymm5, %ymm2, 
%ymm2 +; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; KNL-NEXT: vpminub %ymm4, %ymm3, %ymm4 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3 +; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 ; KNL-NEXT: andl $127, %esi ; KNL-NEXT: testb %dil, %dil ; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp) Index: test/CodeGen/X86/avx512vl-vec-masked-cmp.ll =================================================================== --- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -16294,11 +16294,11 @@ ; ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -16321,11 +16321,11 @@ ; ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxub (%rdi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -16350,11 +16350,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -16381,11 +16381,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxub (%rsi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -16413,11 +16413,11 @@ ; ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -16441,11 +16441,11 @@ ; ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxub (%rdi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -16471,11 +16471,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -16502,11 +16502,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxub (%rsi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -16535,15 +16535,16 @@ ; ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -16569,15 +16570,16 @@ ; ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxub (%rdi), %ymm0, %ymm1 +; 
NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -16605,17 +16607,18 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx @@ -16646,17 +16649,18 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxub (%rsi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx @@ -16688,11 +16692,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax @@ -16717,11 +16721,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq 
%xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax @@ -16748,11 +16752,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -16781,11 +16785,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -16814,11 +16818,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -16841,11 +16845,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -16870,11 +16874,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -16901,11 +16905,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = 
[32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -16933,11 +16937,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -16961,11 +16965,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -16991,11 +16995,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -17023,11 +17027,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1 +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -17057,11 +17061,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq 
$15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -17085,11 +17089,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -17115,11 +17119,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -17147,11 +17151,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -17180,11 +17184,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -17209,11 +17213,11 @@ ; ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -17240,11 +17244,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -17272,11 +17276,11 @@ ; ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -17475,17 +17479,18 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpmaxuw %ymm2, %ymm0, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm0 -; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw %ymm3, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -17592,21 +17597,22 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rdi), %ymm2, %ymm3 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm3, %ymm0 +; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm0 
-; NoVLX-NEXT: vpxor 32(%rdi), %ymm2, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw 32(%rdi), %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -17641,17 +17647,17 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm4 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: vmovq %xmm9, %rax -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6 ; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm3, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 ; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 @@ -17664,14 +17670,14 @@ ; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm10 ; NoVLX-NEXT: movl %ecx, %edx ; NoVLX-NEXT: shrl $16, %edx -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rdx ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm2, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 ; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 @@ -17681,7 +17687,7 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rdx -; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm4 ; NoVLX-NEXT: movl %ecx, %edx ; NoVLX-NEXT: shrl $16, %edx ; NoVLX-NEXT: vmovd %ecx, %xmm2 @@ -17739,7 +17745,7 @@ ; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rdx ; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %edx @@ -17788,35 +17794,36 @@ ; NoVLX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rdx ; NoVLX-NEXT: vpinsrw $7, %edx, %xmm1, %xmm1 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm3 ; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3 -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpmaxuw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; 
NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx ; NoVLX-NEXT: shrl $16, %edi ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm3 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm1 -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpmaxuw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %edx ; NoVLX-NEXT: andl %edi, %edx @@ -17929,22 +17936,23 @@ ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm4 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm4, %ymm0 +; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: shrq $48, %rdx -; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm1 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm0 -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm0 +; NoVLX-NEXT: vpmaxuw 32(%rsi), %ymm0, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -214,12 +214,14 @@ ; AVX1-LABEL: test8: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] -; AVX1-NEXT: vpcmpgtw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32766,32766,32766,32766,32766,32766,32766,32766] +; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: 
vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769] ; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 @@ -338,12 +340,14 @@ ; AVX1-LABEL: test11: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] -; AVX1-NEXT: vpcmpgtb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [126,126,126,126,126,126,126,126,126,126,126,126,126,126,126,126] +; AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -496,22 +500,23 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: psubd %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm1 -; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm7 +; SSE41-NEXT: pmaxud %xmm2, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pshufb %xmm6, %xmm7 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] ; SSE41-NEXT: psubd %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm1, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm3 +; SSE41-NEXT: psubd %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm6, %xmm4 +; SSE41-NEXT: pshufb %xmm6, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE41-NEXT: pandn %xmm4, %xmm0 ; SSE41-NEXT: retq @@ -521,15 +526,15 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm3 
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] @@ -543,10 +548,10 @@ ; AVX2-LABEL: test13: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 @@ -682,44 +687,43 @@ ; ; SSE41-LABEL: test14: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: psubd %xmm6, %xmm4 -; SSE41-NEXT: por %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = -; SSE41-NEXT: pshufb %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm3, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: psubd %xmm9, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm9 -; SSE41-NEXT: pshufb %xmm10, %xmm9 -; SSE41-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: psubd %xmm0, %xmm1 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: movdqa 
%xmm4, %xmm0 +; SSE41-NEXT: pmaxud %xmm10, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = +; SSE41-NEXT: pshufb %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pmaxud %xmm9, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: pshufb %xmm7, %xmm5 +; SSE41-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmaxud %xmm8, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm12 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm12, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: por %xmm8, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE41-NEXT: pshufb %xmm6, %xmm5 -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4,5,6,7] -; SSE41-NEXT: psubd %xmm8, %xmm2 +; SSE41-NEXT: pmaxud %xmm11, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: pxor %xmm6, %xmm7 +; SSE41-NEXT: pshufb %xmm12, %xmm7 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psubd %xmm11, %xmm2 +; SSE41-NEXT: psubd %xmm8, %xmm1 +; SSE41-NEXT: psubd %xmm9, %xmm3 +; SSE41-NEXT: psubd %xmm10, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: pand %xmm5, %xmm3 @@ -737,31 +741,31 @@ ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm3 -; AVX1-NEXT: vpor %xmm6, %xmm0, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm4 -; AVX1-NEXT: vpor %xmm6, %xmm10, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm11 -; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4 -; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm3 -; AVX1-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm8, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpmaxud %xmm0, %xmm6, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpmaxud %xmm11, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4 +; 
AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm10 +; AVX1-NEXT: vpmaxud %xmm9, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm1, %xmm7 +; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpmaxud %xmm8, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpacksswb %xmm10, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm8, %xmm4, %xmm4 ; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm10, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpsubd %xmm11, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -779,18 +783,18 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm6 -; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6 -; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm4 -; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 +; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm6 +; AVX2-NEXT: vpcmpeqd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpxor %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] @@ -881,23 +885,25 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: por %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: packssdw %xmm4, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pminud %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; 
SSE41-NEXT: pshufb %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm7 +; SSE41-NEXT: pminud %xmm2, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pshufb %xmm6, %xmm7 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE41-NEXT: psubd %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm1, %xmm3 +; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm6, %xmm0 +; SSE41-NEXT: pshufb %xmm6, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test15: @@ -905,15 +911,15 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpminud %xmm5, %xmm2, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] @@ -927,10 +933,10 @@ ; AVX2-LABEL: test15: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 @@ -1015,23 +1021,25 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: por %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: packssdw %xmm4, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pmaxud %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm7 +; SSE41-NEXT: pmaxud %xmm3, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pshufb %xmm6, %xmm7 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE41-NEXT: psubd %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm1, %xmm3 +; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm6, %xmm0 +; SSE41-NEXT: pshufb %xmm6, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test16: @@ -1039,15 +1047,15 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm5 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmaxud %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] @@ -1061,10 +1069,10 @@ ; AVX2-LABEL: test16: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 Index: test/CodeGen/X86/vec_cmp_uint-128.ll =================================================================== --- test/CodeGen/X86/vec_cmp_uint-128.ll +++ test/CodeGen/X86/vec_cmp_uint-128.ll @@ -506,46 +506,81 @@ ; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE42-NEXT: retq ; -; AVX-LABEL: gt_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: gt_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: gt_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: gt_v2i64: ; XOP: # %bb.0: ; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; AVX512-LABEL: gt_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = icmp ugt <2 x i64> %a, %b %2 = sext <2 x i1> %1 to <2 x i64> ret <2 x i64> %2 } define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { -; SSE-LABEL: gt_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: gt_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: gt_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE42-LABEL: gt_v4i32: +; SSE42: # %bb.0: +; SSE42-NEXT: pminud %xmm0, %xmm1 +; SSE42-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX1-LABEL: gt_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: gt_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: gt_v4i32: @@ -555,10 +590,11 @@ ; ; AVX512-LABEL: gt_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp ugt <4 x i32> %a, %b %2 = sext <4 x i1> %1 to <4 x i32> @@ -566,26 +602,59 @@ } define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { -; SSE-LABEL: gt_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm0 
-; SSE-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: gt_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: gt_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: gt_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pminuw %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE42-LABEL: gt_v8i16: +; SSE42: # %bb.0: +; SSE42-NEXT: pminuw %xmm0, %xmm1 +; SSE42-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: gt_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: gt_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: gt_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; AVX512-LABEL: gt_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = icmp ugt <8 x i16> %a, %b %2 = sext <8 x i1> %1 to <8 x i16> ret <8 x i16> %2 @@ -594,24 +663,41 @@ define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE-LABEL: gt_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE-NEXT: pminub %xmm0, %xmm1 +; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: gt_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: gt_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: gt_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: gt_v16i8: ; XOP: # %bb.0: ; XOP-NEXT: vpcomgtub %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; AVX512-LABEL: gt_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = 
icmp ugt <16 x i8> %a, %b %2 = sext <16 x i1> %1 to <16 x i8> ret <16 x i8> %2 @@ -843,47 +929,82 @@ ; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: retq ; -; AVX-LABEL: lt_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: lt_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: lt_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: lt_v2i64: ; XOP: # %bb.0: ; XOP-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; AVX512-LABEL: lt_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = icmp ult <2 x i64> %a, %b %2 = sext <2 x i1> %1 to <2 x i64> ret <2 x i64> %2 } define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { -; SSE-LABEL: lt_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm2 -; SSE-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: lt_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: lt_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE42-LABEL: lt_v4i32: +; SSE42: # %bb.0: +; SSE42-NEXT: pmaxud %xmm0, %xmm1 +; SSE42-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX1-LABEL: lt_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: lt_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: lt_v4i32: @@ -893,10 +1014,11 @@ ; ; AVX512-LABEL: lt_v4i32: ; AVX512: # %bb.0: -; 
AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp ult <4 x i32> %a, %b %2 = sext <4 x i1> %1 to <4 x i32> @@ -904,27 +1026,60 @@ } define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { -; SSE-LABEL: lt_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm2 -; SSE-NEXT: pcmpgtw %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: lt_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: lt_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: lt_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE42-LABEL: lt_v8i16: +; SSE42: # %bb.0: +; SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; SSE42-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: lt_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: lt_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: lt_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; AVX512-LABEL: lt_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = icmp ult <8 x i16> %a, %b %2 = sext <8 x i1> %1 to <8 x i16> ret <8 x i16> %2 @@ -933,25 +1088,41 @@ define <16 x i8> @lt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE-LABEL: lt_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm2 -; SSE-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pmaxub %xmm0, %xmm1 +; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: lt_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: 
vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: lt_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: lt_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: lt_v16i8: ; XOP: # %bb.0: ; XOP-NEXT: vpcomltub %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq +; +; AVX512-LABEL: lt_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = icmp ult <16 x i8> %a, %b %2 = sext <16 x i1> %1 to <16 x i8> ret <16 x i8> %2 Index: test/CodeGen/X86/vec_minmax_match.ll =================================================================== --- test/CodeGen/X86/vec_minmax_match.ll +++ test/CodeGen/X86/vec_minmax_match.ll @@ -223,11 +223,12 @@ ; CHECK-LABEL: wrong_pred_for_smin_with_not: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4294967291,4294967291,4294967291,4294967291] -; CHECK-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm3 +; CHECK-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4294967291,4294967291,4294967291,4294967291] +; CHECK-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: retq %not_x = xor <4 x i32> %x, %cmp = icmp ugt <4 x i32> %x, Index: test/CodeGen/X86/vec_setcc-2.ll =================================================================== --- test/CodeGen/X86/vec_setcc-2.ll +++ test/CodeGen/X86/vec_setcc-2.ll @@ -5,26 +5,49 @@ ; For a setult against a constant, turn it into a setule and lower via psubusw. 
define void @loop_no_const_reload(<2 x i64>* %in, <2 x i64>* %out, i32 %n) { -; CHECK-LABEL: loop_no_const_reload: -; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: testl %edx, %edx -; CHECK-NEXT: je LBB0_3 -; CHECK-NEXT: ## %bb.1: ## %for.body.preheader -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25] -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB0_2: ## %for.body -; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movdqa (%rdi,%rax), %xmm2 -; CHECK-NEXT: psubusw %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqw %xmm1, %xmm2 -; CHECK-NEXT: movdqa %xmm2, (%rsi,%rax) -; CHECK-NEXT: addq $16, %rax -; CHECK-NEXT: decl %edx -; CHECK-NEXT: jne LBB0_2 -; CHECK-NEXT: LBB0_3: ## %for.end -; CHECK-NEXT: retq +; SSE2-LABEL: loop_no_const_reload: +; SSE2: ## %bb.0: ## %entry +; SSE2-NEXT: testl %edx, %edx +; SSE2-NEXT: je LBB0_3 +; SSE2-NEXT: ## %bb.1: ## %for.body.preheader +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: LBB0_2: ## %for.body +; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movdqa (%rdi,%rax), %xmm2 +; SSE2-NEXT: psubusw %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rsi,%rax) +; SSE2-NEXT: addq $16, %rax +; SSE2-NEXT: decl %edx +; SSE2-NEXT: jne LBB0_2 +; SSE2-NEXT: LBB0_3: ## %for.end +; SSE2-NEXT: retq +; +; SSE41-LABEL: loop_no_const_reload: +; SSE41: ## %bb.0: ## %entry +; SSE41-NEXT: testl %edx, %edx +; SSE41-NEXT: je LBB0_3 +; SSE41-NEXT: ## %bb.1: ## %for.body.preheader +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [26,26,26,26,26,26,26,26] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: .p2align 4, 0x90 +; SSE41-NEXT: LBB0_2: ## %for.body +; SSE41-NEXT: ## =>This Inner Loop Header: Depth=1 +; SSE41-NEXT: movdqa (%rdi,%rax), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pmaxuw %xmm0, %xmm3 +; SSE41-NEXT: pcmpeqw %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm3, (%rsi,%rax) +; SSE41-NEXT: addq $16, %rax +; SSE41-NEXT: decl %edx +; SSE41-NEXT: jne LBB0_2 +; SSE41-NEXT: LBB0_3: ## %for.end +; SSE41-NEXT: retq entry: %cmp9 = icmp eq i32 %n, 0 br i1 %cmp9, label %for.end, label %for.body @@ -51,27 +74,50 @@ ; Be careful if decrementing the constant would undeflow. 
define void @loop_const_folding_underflow(<2 x i64>* %in, <2 x i64>* %out, i32 %n) { -; CHECK-LABEL: loop_const_folding_underflow: -; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: testl %edx, %edx -; CHECK-NEXT: je LBB1_3 -; CHECK-NEXT: ## %bb.1: ## %for.body.preheader -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768] -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [32768,32794,32794,32794,32794,32794,32794,32794] -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB1_2: ## %for.body -; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movdqa (%rdi,%rax), %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: pcmpgtw %xmm2, %xmm3 -; CHECK-NEXT: movdqa %xmm3, (%rsi,%rax) -; CHECK-NEXT: addq $16, %rax -; CHECK-NEXT: decl %edx -; CHECK-NEXT: jne LBB1_2 -; CHECK-NEXT: LBB1_3: ## %for.end -; CHECK-NEXT: retq +; SSE2-LABEL: loop_const_folding_underflow: +; SSE2: ## %bb.0: ## %entry +; SSE2-NEXT: testl %edx, %edx +; SSE2-NEXT: je LBB1_3 +; SSE2-NEXT: ## %bb.1: ## %for.body.preheader +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32794,32794,32794,32794,32794,32794,32794] +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: LBB1_2: ## %for.body +; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movdqa (%rdi,%rax), %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, (%rsi,%rax) +; SSE2-NEXT: addq $16, %rax +; SSE2-NEXT: decl %edx +; SSE2-NEXT: jne LBB1_2 +; SSE2-NEXT: LBB1_3: ## %for.end +; SSE2-NEXT: retq +; +; SSE41-LABEL: loop_const_folding_underflow: +; SSE41: ## %bb.0: ## %entry +; SSE41-NEXT: testl %edx, %edx +; SSE41-NEXT: je LBB1_3 +; SSE41-NEXT: ## %bb.1: ## %for.body.preheader +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: .p2align 4, 0x90 +; SSE41-NEXT: LBB1_2: ## %for.body +; SSE41-NEXT: ## =>This Inner Loop Header: Depth=1 +; SSE41-NEXT: movdqa (%rdi,%rax), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pmaxuw %xmm0, %xmm3 +; SSE41-NEXT: pcmpeqw %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm3, (%rsi,%rax) +; SSE41-NEXT: addq $16, %rax +; SSE41-NEXT: decl %edx +; SSE41-NEXT: jne LBB1_2 +; SSE41-NEXT: LBB1_3: ## %for.end +; SSE41-NEXT: retq entry: %cmp9 = icmp eq i32 %n, 0 br i1 %cmp9, label %for.end, label %for.body @@ -100,9 +146,11 @@ define <16 x i8> @test_ult_byte(<16 x i8> %a) { ; CHECK-LABEL: test_ult_byte: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: psubusb {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; CHECK-NEXT: pmaxub %xmm0, %xmm1 ; CHECK-NEXT: pcmpeqb %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm1, %xmm0 ; CHECK-NEXT: retq entry: %icmp = icmp ult <16 x i8> %a, @@ -114,14 +162,22 @@ ; register operands. 
define <8 x i16> @test_ult_register(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_ult_register: -; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtw %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: retq +; SSE2-LABEL: test_ult_register: +; SSE2: ## %bb.0: ## %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_ult_register: +; SSE41: ## %bb.0: ## %entry +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: retq entry: %icmp = icmp ult <8 x i16> %a, %b %sext = sext <8 x i1> %icmp to <8 x i16>