Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -30482,7 +30482,8 @@
   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
 }
 
-// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW.
+// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
+// PHMINPOSUW.
 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
                                              const X86Subtarget &Subtarget) {
   // Bail without SSE41.
@@ -30490,7 +30491,7 @@
     return SDValue();
 
   EVT ExtractVT = Extract->getValueType(0);
-  if (ExtractVT != MVT::i16)
+  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
     return SDValue();
 
   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
@@ -30502,7 +30503,7 @@
 
   EVT SrcVT = Src.getValueType();
   EVT SrcSVT = SrcVT.getScalarType();
-  if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
+  if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
     return SDValue();
 
   SDLoc DL(Extract);
@@ -30518,22 +30519,39 @@
     SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
     MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
   }
-  assert(SrcVT == MVT::v8i16 && "Unexpected value type");
+  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
+          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
+         "Unexpected value type");
 
   // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
   // to flip the value accordingly.
   SDValue Mask;
+  unsigned MaskEltsBits = ExtractVT.getSizeInBits();
   if (BinOp == ISD::SMAX)
-    Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
+    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
   else if (BinOp == ISD::SMIN)
-    Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
+    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
   else if (BinOp == ISD::UMAX)
-    Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
+    Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
 
   if (Mask)
     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
 
-  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
+  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
+  // shuffling each upper element down and inserting zeros. This means that the
+  // v16i8 UMIN will leave the upper element as zero, performing zero-extension
+  // ready for the PHMINPOS.
+  if (ExtractVT == MVT::i8) {
+    SDValue Upper = DAG.getVectorShuffle(
+        SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
+        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
+    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
+  }
+
+  // Perform the PHMINPOS on a v8i16 vector.
+  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
+  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
+  MinPos = DAG.getBitcast(SrcVT, MinPos);
 
   if (Mask)
     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
@@ -30849,7 +30867,7 @@
   if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
     return Cmp;
 
-  // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
+  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
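For reference, the byte-reduction sequence this combine now emits (pxor with the mask, psrlw/pminub pairwise byte min, phminposuw, pxor back, pextrb) can be written directly with SSE4.1 intrinsics. The sketch below is illustrative only: it covers the SMAX-of-v16i8 case, assumes <immintrin.h> and an SSE4.1 target, and the helper name is invented rather than taken from the patch.

#include <immintrin.h>
#include <stdint.h>

/* Scalar-equivalent sketch of the new lowering for a signed-max reduction
   over 16 x i8, assuming SSE4.1 (e.g. compiled with -msse4.1). */
static int8_t reduce_smax_v16i8(__m128i v) {
  const __m128i mask = _mm_set1_epi8(0x7f);  /* APInt::getSignedMaxValue(8) per byte */
  v = _mm_xor_si128(v, mask);                /* signed max becomes unsigned min */
  __m128i upper = _mm_srli_epi16(v, 8);      /* odd bytes shifted down, zeros above */
  v = _mm_min_epu8(v, upper);                /* each word holds zext(min of a byte pair) */
  v = _mm_minpos_epu16(v);                   /* PHMINPOSUW over the eight words */
  v = _mm_xor_si128(v, mask);                /* undo the flip */
  return (int8_t)_mm_extract_epi8(v, 0);     /* PEXTRB $0 */
}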
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; Index: llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll +++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -309,30 +309,25 @@ ; ; X86-SSE42-LABEL: test_reduce_v16i8: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v16i8: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX-NEXT: retl @@ -371,30 +366,25 @@ ; ; X64-SSE42-LABEL: test_reduce_v16i8: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v16i8: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, 
%xmm0 +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX-NEXT: retq @@ -906,16 +896,13 @@ ; X86-SSE42-LABEL: test_reduce_v32i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -924,14 +911,12 @@ ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -940,15 +925,13 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -994,16 +977,13 @@ ; X64-SSE42-LABEL: test_reduce_v32i8: ; X64-SSE42: ## %bb.0: ; 
X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1012,14 +992,12 @@ ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1028,15 +1006,13 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1045,15 +1021,13 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsb 
%xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper @@ -1743,16 +1717,13 @@ ; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1 ; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0 ; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -1764,14 +1735,12 @@ ; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -1781,15 +1750,13 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: 
vzeroupper @@ -1847,16 +1814,13 @@ ; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1 ; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0 ; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1868,14 +1832,12 @@ ; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1885,15 +1847,13 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1902,17 +1862,15 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper Index: llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll +++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -311,30 +311,25 @@ ; ; X86-SSE42-LABEL: test_reduce_v16i8: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v16i8: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX-NEXT: retl @@ -373,30 +368,25 @@ ; ; X64-SSE42-LABEL: test_reduce_v16i8: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; 
X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v16i8: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX-NEXT: retq @@ -910,16 +900,13 @@ ; X86-SSE42-LABEL: test_reduce_v32i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -928,14 +915,12 @@ ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -944,15 +929,13 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: 
vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -998,16 +981,13 @@ ; X64-SSE42-LABEL: test_reduce_v32i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1016,14 +996,12 @@ ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1032,15 +1010,13 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax 
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1049,15 +1025,13 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper @@ -1745,16 +1719,13 @@ ; X86-SSE42-NEXT: pminsb %xmm3, %xmm1 ; X86-SSE42-NEXT: pminsb %xmm2, %xmm0 ; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -1766,14 +1737,12 @@ ; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -1783,15 +1752,13 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: 
vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -1849,16 +1816,13 @@ ; X64-SSE42-NEXT: pminsb %xmm3, %xmm1 ; X64-SSE42-NEXT: pminsb %xmm2, %xmm0 ; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1870,14 +1834,12 @@ ; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1887,15 +1849,13 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, 
%xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1904,17 +1864,15 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper Index: llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll +++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -362,30 +362,25 @@ ; ; X86-SSE42-LABEL: test_reduce_v16i8: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v16i8: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX-NEXT: retl @@ -408,30 +403,25 @@ ; ; X64-SSE42-LABEL: test_reduce_v16i8: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v16i8: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX-NEXT: retq @@ -1031,16 +1021,13 @@ ; X86-SSE42-LABEL: test_reduce_v32i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -1049,14 +1036,12 @@ ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -1065,15 +1050,13 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -1099,16 +1082,13 @@ ; X64-SSE42-LABEL: test_reduce_v32i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1117,14 +1097,12 @@ ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1133,15 +1111,13 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor 
%xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1150,15 +1126,13 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper @@ -1992,16 +1966,13 @@ ; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1 ; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0 ; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -2013,14 +1984,12 @@ ; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -2030,15 +1999,13 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; 
X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -2068,16 +2035,13 @@ ; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1 ; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0 ; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -2089,14 +2053,12 @@ ; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -2106,15 +2068,13 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -2123,17 +2083,15 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; 
X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT: vzeroupper
Index: llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -352,30 +352,19 @@
 ;
 ; X86-SSE42-LABEL: test_reduce_v16i8:
 ; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
 ; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT: retl
 ;
 ; X86-AVX-LABEL: test_reduce_v16i8:
 ; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
 ; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
 ; X86-AVX-NEXT: retl
@@ -398,30 +387,19 @@
 ;
 ; X64-SSE42-LABEL: test_reduce_v16i8:
 ; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
 ; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT: retq
 ;
 ; X64-AVX-LABEL: test_reduce_v16i8:
 ; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
 ; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-AVX-NEXT: retq
@@ -1004,16 +982,10 @@
 ; X86-SSE42-LABEL: test_reduce_v32i8:
 ; X86-SSE42: ## %bb.0:
 ; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
 ; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT: retl
@@ -1022,14 +994,9 @@
 ; X86-AVX1: ## %bb.0:
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT: vzeroupper
@@ -1038,15 +1005,10 @@
 ; X86-AVX2-LABEL: test_reduce_v32i8:
 ; X86-AVX2: ## %bb.0:
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT: vzeroupper
@@ -1072,16 +1034,10 @@
 ; X64-SSE42-LABEL: test_reduce_v32i8:
 ; X64-SSE42: ## %bb.0:
 ; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
 ; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT: retq
@@ -1090,14 +1046,9 @@
 ; X64-AVX1: ## %bb.0:
 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT: vzeroupper
@@ -1106,15 +1057,10 @@
 ; X64-AVX2-LABEL: test_reduce_v32i8:
 ; X64-AVX2: ## %bb.0:
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT: vzeroupper
@@ -1123,15 +1069,10 @@
 ; X64-AVX512-LABEL: test_reduce_v32i8:
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT: vzeroupper
@@ -1942,16 +1883,10 @@
 ; X86-SSE42-NEXT: pminub %xmm3, %xmm1
 ; X86-SSE42-NEXT: pminub %xmm2, %xmm0
 ; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
 ; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT: retl
@@ -1963,14 +1898,9 @@
 ; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT: vzeroupper
@@ -1980,15 +1910,10 @@
 ; X86-AVX2: ## %bb.0:
 ; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT: vzeroupper
@@ -2018,16 +1943,10 @@
 ; X64-SSE42-NEXT: pminub %xmm3, %xmm1
 ; X64-SSE42-NEXT: pminub %xmm2, %xmm0
 ; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
 ; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT: retq
@@ -2039,14 +1958,9 @@
 ; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT: vzeroupper
@@ -2056,15 +1970,10 @@
 ; X64-AVX2: ## %bb.0:
 ; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT: vzeroupper
@@ -2073,17 +1982,12 @@
 ; X64-AVX512-LABEL: test_reduce_v64i8:
 ; X64-AVX512: ## %bb.0:
 ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT: vzeroupper
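
For context only (not part of the patch): the test_reduce_* CHECK lines above are generated from IR reduction functions along the lines of the sketch below. The function name, value names and shuffle masks here are illustrative assumptions rather than a copy of horizontal-reduce-umin.ll; the idiom shown is the usual log2-step shufflevector + icmp + select unsigned-min reduction, which the updated lowering now turns into the psrlw/pminub/phminposuw/pextrb sequence checked above on SSE4.2-capable targets.

; Illustrative sketch only -- not taken verbatim from the test file.
define i8 @reduce_umin_v16i8(<16 x i8> %a0) {
  ; Fold the upper 8 bytes into the lower 8 bytes.
  %s1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %c1 = icmp ult <16 x i8> %a0, %s1
  %m1 = select <16 x i1> %c1, <16 x i8> %a0, <16 x i8> %s1
  ; Fold 8 live lanes down to 4.
  %s2 = shufflevector <16 x i8> %m1, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %c2 = icmp ult <16 x i8> %m1, %s2
  %m2 = select <16 x i1> %c2, <16 x i8> %m1, <16 x i8> %s2
  ; Fold 4 live lanes down to 2.
  %s3 = shufflevector <16 x i8> %m2, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %c3 = icmp ult <16 x i8> %m2, %s3
  %m3 = select <16 x i1> %c3, <16 x i8> %m2, <16 x i8> %s3
  ; Fold the final pair; lane 0 now holds the minimum byte.
  %s4 = shufflevector <16 x i8> %m3, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %c4 = icmp ult <16 x i8> %m3, %s4
  %m4 = select <16 x i1> %c4, <16 x i8> %m3, <16 x i8> %s4
  %r  = extractelement <16 x i8> %m4, i32 0
  ret i8 %r
}

Each shufflevector halves the number of live lanes, so four icmp/select steps reduce the sixteen bytes to a single minimum byte, which is then extracted; the wider v32i8 and v64i8 tests simply prepend extra pminub steps that fold the vector down to 128 bits first, as the CHECK lines above show.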