Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -39222,10 +39222,6 @@ // PHMINPOSUW. static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // Bail without SSE41. - if (!Subtarget.hasSSE41()) - return SDValue(); - EVT ExtractVT = Extract->getValueType(0); if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); @@ -39243,19 +39239,48 @@ return SDValue(); SDLoc DL(Extract); - SDValue MinPos = Src; + SDValue Result = Src; // First, reduce the source down to 128-bit, applying BinOp to lo/hi. while (SrcVT.getSizeInBits() > 128) { SDValue Lo, Hi; - std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL); + std::tie(Lo, Hi) = splitVector(Result, DAG, DL); SrcVT = Lo.getValueType(); - MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); + Result = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && "Unexpected value type"); + // PHMINPOSUW is only available when SSE41 is supported. If it is not supported + // we just rewrite UMIN/UMAX of v8i16 into SMIN/SMAX to avoid the inefficient + // lowering into pairs of PSUBUSW and PADDW/PSUBW. + if (!Subtarget.hasSSE41()) + { + if (SrcVT != MVT::v8i16 || (BinOp != ISD::UMAX && BinOp != ISD::UMIN)) + return SDValue(); + SDValue Mask = DAG.getConstant( + APInt::getSignedMinValue(ExtractVT.getSizeInBits()), DL, SrcVT); + Result = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, Result); + + // Emit a reduction tree with SMIN/SMAX operations. + ISD::NodeType SignedBinOp = BinOp == ISD::UMAX ? ISD::SMAX : ISD::SMIN; + Result = DAG.getNode(SignedBinOp, DL, SrcVT, + DAG.getVectorShuffle(SrcVT, DL, Result, DAG.getUNDEF(SrcVT), + {4, 5, 6, 7, -1, -1, -1, -1}), Result); + Result = DAG.getNode(SignedBinOp, DL, SrcVT, + DAG.getVectorShuffle(SrcVT, DL, Result, DAG.getUNDEF(SrcVT), + {2, 3, -1, -1, -1, -1, -1, -1}), Result); + Result = DAG.getNode(SignedBinOp, DL, SrcVT, + DAG.getVectorShuffle(SrcVT, DL, Result, DAG.getUNDEF(SrcVT), + {1, -1, -1, -1, -1, -1, -1, -1}), Result); + + // Flip the sign bit back and return. + Result = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, Result); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, Result, + DAG.getIntPtrConstant(0, DL)); + } + // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask // to flip the value accordingly. SDValue Mask; @@ -39268,7 +39293,7 @@ Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT); if (Mask) - MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); + Result = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, Result); // For v16i8 cases we need to perform UMIN on pairs of byte elements, // shuffling each upper element down and insert zeros. This means that the @@ -39276,20 +39301,20 @@ // ready for the PHMINPOS. if (ExtractVT == MVT::i8) { SDValue Upper = DAG.getVectorShuffle( - SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8), + SrcVT, DL, Result, DAG.getConstant(0, DL, MVT::v16i8), {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); - MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); + Result = DAG.getNode(ISD::UMIN, DL, SrcVT, Result, Upper); } // Perform the PHMINPOS on a v8i16 vector, - MinPos = DAG.getBitcast(MVT::v8i16, MinPos); - MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); - MinPos = DAG.getBitcast(SrcVT, MinPos); + Result = DAG.getBitcast(MVT::v8i16, Result); + Result = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, Result); + Result = DAG.getBitcast(SrcVT, Result); if (Mask) - MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); + Result = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, Result); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, Result, DAG.getIntPtrConstant(0, DL)); } Index: llvm/test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -238,17 +238,16 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pxor LCPI2_0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -274,17 +273,16 @@ ; ; X64-SSE2-LABEL: test_reduce_v8i16: ; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -826,17 +824,16 @@ ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor LCPI6_0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -881,17 +878,16 @@ ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1648,17 +1644,16 @@ ; X86-SSE2-NEXT: paddw %xmm1, %xmm3 ; X86-SSE2-NEXT: psubusw %xmm2, %xmm3 ; X86-SSE2-NEXT: paddw %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor LCPI10_0, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psubusw %xmm3, %xmm0 -; X86-SSE2-NEXT: paddw %xmm3, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1713,17 +1708,16 @@ ; X64-SSE2-NEXT: paddw %xmm1, %xmm3 ; X64-SSE2-NEXT: psubusw %xmm2, %xmm3 ; X64-SSE2-NEXT: paddw %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; X64-SSE2-NEXT: psubusw %xmm3, %xmm0 -; X64-SSE2-NEXT: paddw %xmm3, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1985,17 +1979,16 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pxor LCPI12_0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -2022,17 +2015,16 @@ ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2093,17 +2085,16 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pxor LCPI13_0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -2130,17 +2121,16 @@ ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; Index: llvm/test/CodeGen/X86/horizontal-reduce-umin.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -240,20 +240,16 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pxor LCPI2_0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -273,20 +269,16 @@ ; ; X64-SSE2-LABEL: test_reduce_v8i16: ; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -773,20 +765,16 @@ ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 +; X86-SSE2-NEXT: pxor LCPI6_0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -823,20 +811,16 @@ ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1569,20 +1553,16 @@ ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 +; X86-SSE2-NEXT: pxor LCPI10_0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1631,20 +1611,16 @@ ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1875,20 +1851,16 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pxor LCPI12_0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1909,20 +1881,16 @@ ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1956,20 +1924,16 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: +; X86-SSE2-NEXT: pxor LCPI13_0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1990,20 +1954,16 @@ ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: +; X64-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -1363,17 +1363,16 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: psubusw %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1425,17 +1424,16 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: psubusw %xmm0, %xmm1 ; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: psubusw %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: psubusw %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1512,17 +1510,16 @@ ; SSE2-NEXT: paddw %xmm1, %xmm3 ; SSE2-NEXT: psubusw %xmm2, %xmm3 ; SSE2-NEXT: paddw %xmm2, %xmm3 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE2-NEXT: psubusw %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm3, %xmm0 +; SSE2-NEXT: pmaxsw %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: psubusw %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1603,31 +1600,30 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psubusw %xmm1, %xmm5 -; SSE2-NEXT: paddw %xmm1, %xmm5 -; SSE2-NEXT: psubusw %xmm3, %xmm7 -; SSE2-NEXT: paddw %xmm3, %xmm7 ; SSE2-NEXT: psubusw %xmm0, %xmm4 ; SSE2-NEXT: paddw %xmm0, %xmm4 ; SSE2-NEXT: psubusw %xmm2, %xmm6 ; SSE2-NEXT: paddw %xmm2, %xmm6 ; SSE2-NEXT: psubusw %xmm4, %xmm6 ; SSE2-NEXT: paddw %xmm4, %xmm6 +; SSE2-NEXT: psubusw %xmm1, %xmm5 +; SSE2-NEXT: paddw %xmm1, %xmm5 +; SSE2-NEXT: psubusw %xmm3, %xmm7 +; SSE2-NEXT: paddw %xmm3, %xmm7 ; SSE2-NEXT: psubusw %xmm5, %xmm7 ; SSE2-NEXT: paddw %xmm5, %xmm7 ; SSE2-NEXT: psubusw %xmm6, %xmm7 ; SSE2-NEXT: paddw %xmm6, %xmm7 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE2-NEXT: psubusw %xmm7, %xmm0 -; SSE2-NEXT: paddw %xmm7, %xmm0 +; SSE2-NEXT: pmaxsw %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: psubusw %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-reduce-umin.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -1370,20 +1370,16 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-LABEL: test_v8i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1417,20 +1413,16 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubusw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1487,20 +1479,16 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubusw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1556,41 +1544,37 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: psubusw %xmm6, %xmm8 -; SSE2-NEXT: psubw %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psubusw %xmm4, %xmm6 -; SSE2-NEXT: psubw %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psubusw %xmm7, %xmm4 -; SSE2-NEXT: psubw %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psubusw %xmm5, %xmm4 -; SSE2-NEXT: psubw %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psubusw %xmm3, %xmm4 -; SSE2-NEXT: psubw %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: psubusw %xmm7, %xmm8 +; SSE2-NEXT: psubw %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: psubusw %xmm5, %xmm7 +; SSE2-NEXT: psubw %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psubusw %xmm3, %xmm5 +; SSE2-NEXT: psubw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psubusw %xmm6, %xmm3 +; SSE2-NEXT: psubw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubusw %xmm4, %xmm3 +; SSE2-NEXT: psubw %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psubusw %xmm2, %xmm3 ; SSE2-NEXT: psubw %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubusw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ;