Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -335,6 +335,9 @@
       // Vector integer comparisons, the result is in a mask vector.
       PCMPEQM, PCMPGTM,
 
+      // v8i16 Horizontal minimum and position.
+      PHMINPOS,
+
       MULTISHIFT,
 
       /// Vector comparison generating mask bits for fp and
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -25073,6 +25073,7 @@
   case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
   case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
   case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
+  case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
   case X86ISD::ADD: return "X86ISD::ADD";
   case X86ISD::SUB: return "X86ISD::SUB";
   case X86ISD::ADC: return "X86ISD::ADC";
@@ -30326,6 +30327,66 @@
   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
 }
 
+// Attempt to replace a min/max v8i16 horizontal reduction with PHMINPOSUW.
+static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
+                                             const X86Subtarget &Subtarget) {
+  // Bail without SSE41.
+  if (!Subtarget.hasSSE41())
+    return SDValue();
+
+  EVT ExtractVT = Extract->getValueType(0);
+  if (ExtractVT != MVT::i16)
+    return SDValue();
+
+  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
+  unsigned BinOp;
+  SDValue Src = matchBinOpReduction(
+      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
+  if (!Src)
+    return SDValue();
+
+  EVT SrcVT = Src.getValueType();
+  EVT SrcSVT = SrcVT.getScalarType();
+  if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
+    return SDValue();
+
+  SDLoc DL(Extract);
+  SDValue MinPos = Src;
+
+  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
+  while (SrcVT.getSizeInBits() > 128) {
+    unsigned NumElts = SrcVT.getVectorNumElements();
+    unsigned NumSubElts = NumElts / 2;
+    SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
+    unsigned SubSizeInBits = SrcVT.getSizeInBits();
+    SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
+    SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
+    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
+  }
+  assert(SrcVT == MVT::v8i16 && "Unexpected value type");
+
+  // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
+  // to flip the value accordingly.
+  SDValue Mask;
+  if (BinOp == ISD::SMAX)
+    Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
+  else if (BinOp == ISD::SMIN)
+    Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
+  else if (BinOp == ISD::UMAX)
+    Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
+
+  if (Mask)
+    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
+
+  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
+
+  if (Mask)
+    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
                                                 SelectionDAG &DAG,
                                                 const X86Subtarget &Subtarget) {
@@ -30633,6 +30694,10 @@
   if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
     return Cmp;
 
+  // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
+  if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
+    return MinMax;
+
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
   if (SrcVT != MVT::v4i32)
Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -182,6 +182,9 @@
 def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
 def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
 
+def X86phminpos: SDNode<"X86ISD::PHMINPOS",
+                 SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>;
+
 def X86vshiftuniform : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                             SDTCisVec<2>, SDTCisInt<0>,
                                             SDTCisInt<1>]>;
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -6188,22 +6188,20 @@
                 Sched<[WriteFAddLd]>, XS;
 }
 
-
-
 // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
 multiclass SS41I_unop_rm_int_v16 opc, string OpcodeStr,
-                                 Intrinsic IntId128, PatFrag ld_frag,
+                                 SDNode OpNode, PatFrag ld_frag,
                                  X86FoldableSchedWrite Sched> {
   def rr128 : SS48I,
+                    [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                     Sched<[Sched]>;
   def rm128 : SS48I,
+                      (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
                     Sched<[Sched.Folded]>;
 }
@@ -6211,10 +6209,10 @@
 // model, although the naming is misleading.
 let Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
-                                          int_x86_sse41_phminposuw, loadv2i64,
+                                          X86phminpos, loadv2i64,
                                           WriteVecIMul>, VEX, VEX_WIG;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
-                                         int_x86_sse41_phminposuw, memopv2i64,
+                                         X86phminpos, memopv2i64,
                                          WriteVecIMul>;
 
 /// SS48I_binop_rm - Simple SSE41 binary operator.
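
(Illustrative aside, not part of the patch.) PHMINPOSUW only computes an unsigned minimum across the eight i16 lanes, so the combine above reaches SMIN, SMAX and UMAX by XORing the lanes with a bias before and after the instruction: 0x8000 (32768) maps signed order onto unsigned order for SMIN, 0x7FFF (32767) maps signed order onto reversed unsigned order for SMAX, and all-ones reverses unsigned order for UMAX; these are the same constants that appear in the updated tests below. A minimal standalone C++ sketch that checks those three identities on sample i16 values (helper names here are hypothetical, chosen only for the illustration):

// Illustration only: the XOR bias trick that lets an unsigned-min reduction
// (what PHMINPOSUW provides) also compute SMIN/SMAX/UMAX on i16 values.
#include <algorithm>
#include <cassert>
#include <cstdint>

static uint16_t umin16(uint16_t X, uint16_t Y) { return std::min(X, Y); }

int main() {
  const uint16_t Vals[] = {0x0000, 0x0001, 0x7FFE, 0x7FFF, 0x8000, 0x8001, 0xFFFF};
  for (uint16_t A : Vals)
    for (uint16_t B : Vals) {
      int16_t SA = static_cast<int16_t>(A), SB = static_cast<int16_t>(B);
      // SMIN: XOR with 0x8000 maps signed order onto unsigned order.
      assert(static_cast<int16_t>(umin16(A ^ 0x8000, B ^ 0x8000) ^ 0x8000) ==
             std::min(SA, SB));
      // SMAX: XOR with 0x7FFF maps signed order onto reversed unsigned order.
      assert(static_cast<int16_t>(umin16(A ^ 0x7FFF, B ^ 0x7FFF) ^ 0x7FFF) ==
             std::max(SA, SB));
      // UMAX: XOR with all-ones (bitwise NOT) reverses unsigned order.
      assert(static_cast<uint16_t>(umin16(A ^ 0xFFFF, B ^ 0xFFFF) ^ 0xFFFF) ==
             std::max(A, B));
    }
  return 0;
}

UMIN needs no bias at all, which is why the umin tests below lower to a bare phminposuw plus movd.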
Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -1679,6 +1679,7 @@ X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0), X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0), Index: llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll +++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -196,52 +196,68 @@ } define i16 @test_reduce_v8i16(<8 x i16> %a0) { -; X86-SSE-LABEL: test_reduce_v8i16: -; X86-SSE: ## BB#0: -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrld $16, %xmm1 -; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE-NEXT: movd %xmm1, %eax -; X86-SSE-NEXT: ## kill: %AX %AX %EAX -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: test_reduce_v8i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v8i16: ; X86-AVX: ## BB#0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd %xmm0, %eax ; X86-AVX-NEXT: ## kill: %AX %AX %EAX ; X86-AVX-NEXT: retl ; -; X64-SSE-LABEL: test_reduce_v8i16: -; X64-SSE: ## BB#0: -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE-NEXT: psrld $16, %xmm1 -; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE-NEXT: movd %xmm1, %eax -; X64-SSE-NEXT: ## kill: %AX %AX %EAX -; X64-SSE-NEXT: retq +; X64-SSE2-LABEL: test_reduce_v8i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: 
pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v8i16: ; X64-AVX: ## BB#0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vmovd %xmm0, %eax ; X64-AVX-NEXT: ## kill: %AX %AX %EAX ; X64-AVX-NEXT: retq @@ -719,30 +735,39 @@ } define i16 @test_reduce_v16i16(<16 x i16> %a0) { -; X86-SSE-LABEL: test_reduce_v16i16: -; X86-SSE: ## BB#0: -; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrld $16, %xmm1 -; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE-NEXT: movd %xmm1, %eax -; X86-SSE-NEXT: ## kill: %AX %AX %EAX -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: test_reduce_v16i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v16i16: ; X86-AVX1: ## BB#0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: ## kill: %AX %AX 
%EAX ; X86-AVX1-NEXT: vzeroupper @@ -751,42 +776,49 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## BB#0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: %AX %AX %EAX ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl ; -; X64-SSE-LABEL: test_reduce_v16i16: -; X64-SSE: ## BB#0: -; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE-NEXT: psrld $16, %xmm1 -; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE-NEXT: movd %xmm1, %eax -; X64-SSE-NEXT: ## kill: %AX %AX %EAX -; X64-SSE-NEXT: retq +; X64-SSE2-LABEL: test_reduce_v16i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v16i16: ; X64-AVX1: ## BB#0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %xmm0, %eax ; X64-AVX1-NEXT: ## kill: %AX %AX %EAX ; X64-AVX1-NEXT: vzeroupper @@ -795,13 +827,11 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## BB#0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsw 
%xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: %AX %AX %EAX ; X64-AVX2-NEXT: vzeroupper @@ -810,13 +840,11 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## BB#0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: %AX %AX %EAX ; X64-AVX512-NEXT: vzeroupper @@ -1511,21 +1539,34 @@ } define i16 @test_reduce_v32i16(<32 x i16> %a0) { -; X86-SSE-LABEL: test_reduce_v32i16: -; X86-SSE: ## BB#0: -; X86-SSE-NEXT: pmaxsw %xmm3, %xmm1 -; X86-SSE-NEXT: pmaxsw %xmm2, %xmm0 -; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrld $16, %xmm1 -; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE-NEXT: movd %xmm1, %eax -; X86-SSE-NEXT: ## kill: %AX %AX %EAX -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: test_reduce_v32i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsw %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxsw %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v32i16: ; X86-AVX1: ## BB#0: @@ -1534,12 +1575,10 @@ ; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, 
%xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: ## kill: %AX %AX %EAX ; X86-AVX1-NEXT: vzeroupper @@ -1549,33 +1588,44 @@ ; X86-AVX2: ## BB#0: ; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: %AX %AX %EAX ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl ; -; X64-SSE-LABEL: test_reduce_v32i16: -; X64-SSE: ## BB#0: -; X64-SSE-NEXT: pmaxsw %xmm3, %xmm1 -; X64-SSE-NEXT: pmaxsw %xmm2, %xmm0 -; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE-NEXT: psrld $16, %xmm1 -; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE-NEXT: movd %xmm1, %eax -; X64-SSE-NEXT: ## kill: %AX %AX %EAX -; X64-SSE-NEXT: retq +; X64-SSE2-LABEL: test_reduce_v32i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsw %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxsw %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v32i16: ; X64-AVX1: ## BB#0: @@ -1584,12 +1634,10 @@ ; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %xmm0, %eax ; X64-AVX1-NEXT: ## kill: %AX %AX %EAX ; X64-AVX1-NEXT: vzeroupper @@ -1599,13 +1647,11 @@ ; 
X64-AVX2: ## BB#0: ; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: %AX %AX %EAX ; X64-AVX2-NEXT: vzeroupper @@ -1614,15 +1660,13 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## BB#0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: %AX %AX %EAX ; X64-AVX512-NEXT: vzeroupper Index: llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll +++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -198,52 +198,68 @@ } define i16 @test_reduce_v8i16(<8 x i16> %a0) { -; X86-SSE-LABEL: test_reduce_v8i16: -; X86-SSE: ## BB#0: -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrld $16, %xmm1 -; X86-SSE-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE-NEXT: movd %xmm1, %eax -; X86-SSE-NEXT: ## kill: %AX %AX %EAX -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: test_reduce_v8i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v8i16: ; X86-AVX: ## BB#0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; 
X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd %xmm0, %eax ; X86-AVX-NEXT: ## kill: %AX %AX %EAX ; X86-AVX-NEXT: retl ; -; X64-SSE-LABEL: test_reduce_v8i16: -; X64-SSE: ## BB#0: -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE-NEXT: psrld $16, %xmm1 -; X64-SSE-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE-NEXT: movd %xmm1, %eax -; X64-SSE-NEXT: ## kill: %AX %AX %EAX -; X64-SSE-NEXT: retq +; X64-SSE2-LABEL: test_reduce_v8i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v8i16: ; X64-AVX: ## BB#0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vmovd %xmm0, %eax ; X64-AVX-NEXT: ## kill: %AX %AX %EAX ; X64-AVX-NEXT: retq @@ -723,30 +739,39 @@ } define i16 @test_reduce_v16i16(<16 x i16> %a0) { -; X86-SSE-LABEL: test_reduce_v16i16: -; X86-SSE: ## BB#0: -; X86-SSE-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrld $16, %xmm1 -; X86-SSE-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE-NEXT: movd %xmm1, %eax -; X86-SSE-NEXT: ## kill: %AX %AX %EAX -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: test_reduce_v16i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: 
test_reduce_v16i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v16i16: ; X86-AVX1: ## BB#0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: ## kill: %AX %AX %EAX ; X86-AVX1-NEXT: vzeroupper @@ -755,42 +780,49 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## BB#0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: %AX %AX %EAX ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl ; -; X64-SSE-LABEL: test_reduce_v16i16: -; X64-SSE: ## BB#0: -; X64-SSE-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE-NEXT: psrld $16, %xmm1 -; X64-SSE-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE-NEXT: movd %xmm1, %eax -; X64-SSE-NEXT: ## kill: %AX %AX %EAX -; X64-SSE-NEXT: retq +; X64-SSE2-LABEL: test_reduce_v16i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v16i16: ; X64-AVX1: ## BB#0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpminsw %xmm1, 
%xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %xmm0, %eax ; X64-AVX1-NEXT: ## kill: %AX %AX %EAX ; X64-AVX1-NEXT: vzeroupper @@ -799,13 +831,11 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## BB#0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: %AX %AX %EAX ; X64-AVX2-NEXT: vzeroupper @@ -814,13 +844,11 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## BB#0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: %AX %AX %EAX ; X64-AVX512-NEXT: vzeroupper @@ -1513,21 +1541,34 @@ } define i16 @test_reduce_v32i16(<32 x i16> %a0) { -; X86-SSE-LABEL: test_reduce_v32i16: -; X86-SSE: ## BB#0: -; X86-SSE-NEXT: pminsw %xmm3, %xmm1 -; X86-SSE-NEXT: pminsw %xmm2, %xmm0 -; X86-SSE-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrld $16, %xmm1 -; X86-SSE-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE-NEXT: movd %xmm1, %eax -; X86-SSE-NEXT: ## kill: %AX %AX %EAX -; X86-SSE-NEXT: retl +; X86-SSE2-LABEL: test_reduce_v32i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pminsw %xmm3, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX %AX %EAX +; X86-SSE2-NEXT: retl +; +; 
X86-SSE42-LABEL: test_reduce_v32i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminsw %xmm3, %xmm1 +; X86-SSE42-NEXT: pminsw %xmm2, %xmm0 +; X86-SSE42-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AX %AX %EAX +; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v32i16: ; X86-AVX1: ## BB#0: @@ -1536,12 +1577,10 @@ ; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: ## kill: %AX %AX %EAX ; X86-AVX1-NEXT: vzeroupper @@ -1551,33 +1590,44 @@ ; X86-AVX2: ## BB#0: ; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: %AX %AX %EAX ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl ; -; X64-SSE-LABEL: test_reduce_v32i16: -; X64-SSE: ## BB#0: -; X64-SSE-NEXT: pminsw %xmm3, %xmm1 -; X64-SSE-NEXT: pminsw %xmm2, %xmm0 -; X64-SSE-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE-NEXT: psrld $16, %xmm1 -; X64-SSE-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE-NEXT: movd %xmm1, %eax -; X64-SSE-NEXT: ## kill: %AX %AX %EAX -; X64-SSE-NEXT: retq +; X64-SSE2-LABEL: test_reduce_v32i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pminsw %xmm3, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm2, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX %AX %EAX +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminsw %xmm3, %xmm1 +; X64-SSE42-NEXT: pminsw %xmm2, %xmm0 +; X64-SSE42-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = 
[32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AX %AX %EAX +; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v32i16: ; X64-AVX1: ## BB#0: @@ -1586,12 +1636,10 @@ ; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %xmm0, %eax ; X64-AVX1-NEXT: ## kill: %AX %AX %EAX ; X64-AVX1-NEXT: vzeroupper @@ -1601,13 +1649,11 @@ ; X64-AVX2: ## BB#0: ; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: %AX %AX %EAX ; X64-AVX2-NEXT: vzeroupper @@ -1616,15 +1662,13 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## BB#0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: %AX %AX %EAX ; X64-AVX512-NEXT: vzeroupper Index: llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll +++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -259,25 +259,20 @@ ; ; X86-SSE42-LABEL: test_reduce_v8i16: ; X86-SSE42: ## BB#0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: 
psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: ## kill: %AX %AX %EAX ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v8i16: ; X86-AVX: ## BB#0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd %xmm0, %eax ; X86-AVX-NEXT: ## kill: %AX %AX %EAX ; X86-AVX-NEXT: retl @@ -318,25 +313,20 @@ ; ; X64-SSE42-LABEL: test_reduce_v8i16: ; X64-SSE42: ## BB#0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: ## kill: %AX %AX %EAX ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v8i16: ; X64-AVX: ## BB#0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vmovd %xmm0, %eax ; X64-AVX-NEXT: ## kill: %AX %AX %EAX ; X64-AVX-NEXT: retq @@ -879,14 +869,11 @@ ; X86-SSE42-LABEL: test_reduce_v16i16: ; X86-SSE42: ## BB#0: ; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: ## kill: %AX %AX %EAX ; X86-SSE42-NEXT: retl ; @@ -894,12 +881,10 @@ ; X86-AVX1: ## BB#0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: 
vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: ## kill: %AX %AX %EAX ; X86-AVX1-NEXT: vzeroupper @@ -908,13 +893,11 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## BB#0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: %AX %AX %EAX ; X86-AVX2-NEXT: vzeroupper @@ -965,14 +948,11 @@ ; X64-SSE42-LABEL: test_reduce_v16i16: ; X64-SSE42: ## BB#0: ; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: ## kill: %AX %AX %EAX ; X64-SSE42-NEXT: retq ; @@ -980,12 +960,10 @@ ; X64-AVX1: ## BB#0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %xmm0, %eax ; X64-AVX1-NEXT: ## kill: %AX %AX %EAX ; X64-AVX1-NEXT: vzeroupper @@ -994,13 +972,11 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## BB#0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: %AX %AX %EAX ; X64-AVX2-NEXT: vzeroupper @@ -1009,13 +985,11 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## BB#0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; 
X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: %AX %AX %EAX ; X64-AVX512-NEXT: vzeroupper @@ -1821,14 +1795,11 @@ ; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1 ; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0 ; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: ## kill: %AX %AX %EAX ; X86-SSE42-NEXT: retl ; @@ -1839,12 +1810,10 @@ ; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: ## kill: %AX %AX %EAX ; X86-AVX1-NEXT: vzeroupper @@ -1854,13 +1823,11 @@ ; X86-AVX2: ## BB#0: ; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: %AX %AX %EAX ; X86-AVX2-NEXT: vzeroupper @@ -1929,14 +1896,11 @@ ; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1 ; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0 ; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: ## kill: %AX %AX %EAX ; X64-SSE42-NEXT: retq ; @@ -1947,12 +1911,10 @@ ; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; 
X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vmovd %xmm0, %eax
 ; X64-AVX1-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX1-NEXT: vzeroupper
@@ -1962,13 +1924,11 @@
 ; X64-AVX2: ## BB#0:
 ; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vmovd %xmm0, %eax
 ; X64-AVX2-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX2-NEXT: vzeroupper
@@ -1977,15 +1937,13 @@
 ; X64-AVX512-LABEL: test_reduce_v32i16:
 ; X64-AVX512: ## BB#0:
 ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovd %xmm0, %eax
 ; X64-AVX512-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX512-NEXT: vzeroupper
Index: llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -261,25 +261,14 @@
 ;
 ; X86-SSE42-LABEL: test_reduce_v8i16:
 ; X86-SSE42: ## BB#0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
 ; X86-SSE42-NEXT: ## kill: %AX %AX %EAX
 ; X86-SSE42-NEXT: retl
 ;
 ; X86-AVX-LABEL: test_reduce_v8i16:
 ; X86-AVX: ## BB#0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX-NEXT: vmovd %xmm0, %eax
 ; X86-AVX-NEXT: ## kill: %AX %AX %EAX
 ; X86-AVX-NEXT: retl
@@ -320,25 +309,14 @@
 ;
 ; X64-SSE42-LABEL: test_reduce_v8i16:
 ; X64-SSE42: ## BB#0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
 ; X64-SSE42-NEXT: ## kill: %AX %AX %EAX
 ; X64-SSE42-NEXT: retq
 ;
 ; X64-AVX-LABEL: test_reduce_v8i16:
 ; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX-NEXT: vmovd %xmm0, %eax
 ; X64-AVX-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX-NEXT: retq
@@ -885,14 +863,8 @@
 ; X86-SSE42-LABEL: test_reduce_v16i16:
 ; X86-SSE42: ## BB#0:
 ; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
 ; X86-SSE42-NEXT: ## kill: %AX %AX %EAX
 ; X86-SSE42-NEXT: retl
 ;
@@ -900,12 +872,7 @@
 ; X86-AVX1: ## BB#0:
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: ## kill: %AX %AX %EAX
 ; X86-AVX1-NEXT: vzeroupper
@@ -914,13 +881,8 @@
 ; X86-AVX2-LABEL: test_reduce_v16i16:
 ; X86-AVX2: ## BB#0:
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: ## kill: %AX %AX %EAX
 ; X86-AVX2-NEXT: vzeroupper
@@ -971,14 +933,8 @@
 ; X64-SSE42-LABEL: test_reduce_v16i16:
 ; X64-SSE42: ## BB#0:
 ; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
 ; X64-SSE42-NEXT: ## kill: %AX %AX %EAX
 ; X64-SSE42-NEXT: retq
 ;
@@ -986,12 +942,7 @@
 ; X64-AVX1: ## BB#0:
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT: vmovd %xmm0, %eax
 ; X64-AVX1-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX1-NEXT: vzeroupper
@@ -1000,13 +951,8 @@
 ; X64-AVX2-LABEL: test_reduce_v16i16:
 ; X64-AVX2: ## BB#0:
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX2-NEXT: vmovd %xmm0, %eax
 ; X64-AVX2-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX2-NEXT: vzeroupper
@@ -1015,13 +961,8 @@
 ; X64-AVX512-LABEL: test_reduce_v16i16:
 ; X64-AVX512: ## BB#0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovd %xmm0, %eax
 ; X64-AVX512-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX512-NEXT: vzeroupper
@@ -1825,14 +1766,8 @@
 ; X86-SSE42-NEXT: pminuw %xmm3, %xmm1
 ; X86-SSE42-NEXT: pminuw %xmm2, %xmm0
 ; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
 ; X86-SSE42-NEXT: ## kill: %AX %AX %EAX
 ; X86-SSE42-NEXT: retl
 ;
@@ -1843,12 +1778,7 @@
 ; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: ## kill: %AX %AX %EAX
 ; X86-AVX1-NEXT: vzeroupper
@@ -1858,13 +1788,8 @@
 ; X86-AVX2: ## BB#0:
 ; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: ## kill: %AX %AX %EAX
 ; X86-AVX2-NEXT: vzeroupper
@@ -1933,14 +1858,8 @@
 ; X64-SSE42-NEXT: pminuw %xmm3, %xmm1
 ; X64-SSE42-NEXT: pminuw %xmm2, %xmm0
 ; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
 ; X64-SSE42-NEXT: ## kill: %AX %AX %EAX
 ; X64-SSE42-NEXT: retq
 ;
@@ -1951,12 +1870,7 @@
 ; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT: vmovd %xmm0, %eax
 ; X64-AVX1-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX1-NEXT: vzeroupper
@@ -1966,13 +1880,8 @@
 ; X64-AVX2: ## BB#0:
 ; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX2-NEXT: vmovd %xmm0, %eax
 ; X64-AVX2-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX2-NEXT: vzeroupper
@@ -1981,15 +1890,10 @@
 ; X64-AVX512-LABEL: test_reduce_v32i16:
 ; X64-AVX512: ## BB#0:
 ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT: vmovd %xmm0, %eax
 ; X64-AVX512-NEXT: ## kill: %AX %AX %EAX
 ; X64-AVX512-NEXT: vzeroupper