Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2415,6 +2415,8 @@
                        ISD::STRICT_FMA,
                        ISD::FMINNUM,
                        ISD::FMAXNUM,
+                       ISD::FMINIMUM,
+                       ISD::FMAXIMUM,
                        ISD::SUB,
                        ISD::LOAD,
                        ISD::MLOAD,
@@ -53577,6 +53579,77 @@
                      N->getOperand(0), N->getOperand(1));
 }
 
+static SDValue combineFMinimumFMaximum(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+        (Subtarget.hasSSE2() && VT == MVT::f64) ||
+        (Subtarget.hasFP16() && VT == MVT::f16) ||
+        (VT.isVector() && TLI.isTypeLegal(VT))))
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc DL(N);
+  const bool isMax = N->getOpcode() == ISD::FMAXIMUM;
+  const auto MinMaxOp = isMax ? X86ISD::FMAX : X86ISD::FMIN;
+
+  // We want to use X86ISD::FMAX or X86ISD::FMIN, but their semantics differ
+  // from what we need for NaN and -0.0 handling.
+  // Specifically, if -0.0 and 0.0 are compared, the second operand is
+  // returned, and if either operand is NaN, the second operand is returned.
+  // To reuse them we need some preparation.
+  // First of all, for a -0.0 vs 0.0 comparison we need -0.0 to be the second
+  // operand for the MIN operation and 0.0 for the MAX one.
+  // After that, if the first operand is NaN we need to move it into the
+  // second position; if the NaN is already the second operand, it is in
+  // place to be returned. Since for regular operands we do not care about
+  // their position (MIN/MAX return the right value whether it is the first
+  // or the second operand), this gives us the following algorithm
+  // for MAX:
+  //   If sign(Op1) then swap(Op0, Op1)
+  //   if Op0 is NaN then Op1 = Op0
+  //   Result = MAX Op0, Op1
+  // for MIN:
+  //   If sign(Op0) then swap(Op0, Op1)
+  //   if Op0 is NaN then Op1 = Op0
+  //   Result = MIN Op0, Op1
+
+  // If we do not care about -0.0 we can skip the first step.
+  bool SkipSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
+                        N->getFlags().hasNoSignedZeros();
+  if (!SkipSignedZero) {
+    EVT IntVT = VT.changeTypeToInteger();
+    SDValue BC = DAG.getBitcast(IntVT, Op0);
+    SDValue Zero = DAG.getConstant(0, DL, IntVT);
+    EVT IntSetCCType =
+        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), IntVT);
+    SDValue IsLess = DAG.getSetCC(DL, IntSetCCType, BC, Zero, ISD::SETLT);
+    SDValue NewOp0 =
+        DAG.getSelect(DL, VT, IsLess, isMax ? Op0 : Op1, isMax ? Op1 : Op0);
+    Op1 = DAG.getSelect(DL, VT, IsLess, isMax ? Op1 : Op0, isMax ? Op0 : Op1);
+    Op0 = NewOp0;
+  }
+
+  // If the first operand is known never to be NaN we can skip the second step.
+  bool SkipNaN =
+      DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs();
+  SkipNaN = SkipNaN || DAG.isKnownNeverNaN(Op0);
+  if (!SkipNaN) {
+    EVT SetCCType =
+        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
+    Op1 = DAG.getSelect(DL, VT, IsOp0Nan, Op0, Op1);
+  }
+
+  return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1);
+}
+
 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
@@ -57533,6 +57606,8 @@
   case X86ISD::FMAX:      return combineFMinFMax(N, DAG);
   case ISD::FMINNUM:
   case ISD::FMAXNUM:      return combineFMinNumFMaxNum(N, DAG, Subtarget);
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:     return combineFMinimumFMaximum(N, DAG, Subtarget);
   case X86ISD::CVTSI2P:
   case X86ISD::CVTUI2P:   return combineX86INT_TO_FP(N, DAG, DCI);
   case X86ISD::CVTP2SI:
Index: llvm/test/CodeGen/X86/fminimum_maximum.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/fminimum_maximum.ll
@@ -0,0 +1,594 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+declare <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+declare float @llvm.maximum.f32(float %a, float %b)
+declare float @llvm.minimum.f32(float %a, float %b)
+
+; =================== Simple=============================
+define float @max(float %a, float %b) {
+; SSE-LABEL: max:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    testl %eax, %eax
+; SSE-NEXT:    js .LBB0_2
+; SSE-NEXT:  # %bb.1: # %entry
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:  .LBB0_2: # %entry
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    cmpunordss %xmm0, %xmm3
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    andps %xmm0, %xmm4
+; SSE-NEXT:    js .LBB0_4
+; SSE-NEXT:  # %bb.3: # %entry
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:  .LBB0_4: # %entry
+; SSE-NEXT:    andnps %xmm1, %xmm3
+; SSE-NEXT:    orps %xmm4, %xmm3
+; SSE-NEXT:    maxss %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: max:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    js .LBB0_2
+; AVX1-NEXT:  # %bb.1: # %entry
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
+; AVX1-NEXT:  .LBB0_2: # %entry
+; AVX1-NEXT:    js .LBB0_4
+; AVX1-NEXT:  # %bb.3: # %entry
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
+; AVX1-NEXT:  .LBB0_4: # %entry
+; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm0
+; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vmaxss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: max:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    testl %eax, %eax
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vcmpunordss %xmm2, %xmm2, %k2
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k2}
+; AVX512-NEXT:    vmaxss %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+entry:
+  %res = call float @llvm.maximum.f32(float %a, float %b)
+  ret float %res
+}
+
+define float @min(float %a, float %b) {
+; SSE-LABEL: min:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    testl %eax, %eax
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    js .LBB1_2
+; SSE-NEXT:  # %bb.1: # %entry
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:  .LBB1_2: # %entry
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    cmpunordss %xmm0, %xmm3
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    andps %xmm0, %xmm4
+; SSE-NEXT:    js .LBB1_4
+; SSE-NEXT:  # %bb.3: # %entry
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:  .LBB1_4: # %entry
+; SSE-NEXT:    andnps %xmm2, %xmm3
+; SSE-NEXT:    orps %xmm4, %xmm3
+; SSE-NEXT:    minss %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: min:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
+; AVX1-NEXT:    js .LBB1_2
+; AVX1-NEXT:  # %bb.1: # %entry
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:  .LBB1_2: # %entry
+; AVX1-NEXT:    js .LBB1_4
+; AVX1-NEXT:  # %bb.3: # %entry
+; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX1-NEXT:  .LBB1_4: # %entry
+; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vminss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: min:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    testl %eax, %eax
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vcmpunordss %xmm2, %xmm2, %k2
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k2}
+; AVX512-NEXT:    vminss %xmm1, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+entry:
+  %res = call float @llvm.minimum.f32(float %a, float %b)
+  ret float %res
+}
+
+define <2 x float> @maxv2(<2 x float> %a, <2 x float> %b) {
+; SSE2-LABEL: maxv2:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    maxps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: maxv2:
+; SSE4:       # %bb.0: # %entry
+; SSE4-NEXT:    movaps %xmm0, %xmm2
+; SSE4-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE4-NEXT:    blendvps %xmm0, %xmm0, %xmm1
+; SSE4-NEXT:    movaps %xmm1, %xmm0
+; SSE4-NEXT:    cmpunordps %xmm1, %xmm0
+; SSE4-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE4-NEXT:    maxps %xmm2, %xmm1
+; SSE4-NEXT:    movaps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: maxv2:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm1
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %res
+}
+
+define <2 x float> @minv2(<2 x float> %a, <2 x float> %b) {
+; SSE2-LABEL: minv2:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    cmpunordps %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm2, %xmm3
+; SSE2-NEXT:    andps %xmm1, %xmm3
+; SSE2-NEXT:    andnps %xmm0, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    minps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: minv2:
+; SSE4:       # %bb.0: # %entry
+; SSE4-NEXT:    movaps %xmm0, %xmm2
+; SSE4-NEXT:    movaps %xmm1, %xmm3
+; SSE4-NEXT:    blendvps %xmm0, %xmm0, %xmm3
+; SSE4-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE4-NEXT:    movaps %xmm2, %xmm0
+; SSE4-NEXT:    cmpunordps %xmm2, %xmm0
+; SSE4-NEXT:    blendvps %xmm0, %xmm2, %xmm3
+; SSE4-NEXT:    minps %xmm3, %xmm2
+; SSE4-NEXT:    movaps %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: minv2:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm1
+; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %res
+}
+
+; =================== nnan =============================
+define float @max_nnan(float %a, float %b) {
+; SSE-LABEL: max_nnan:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    testl %eax, %eax
+; SSE-NEXT:    js .LBB4_1
+; SSE-NEXT:  # %bb.2: # %entry
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    jmp .LBB4_3
+; SSE-NEXT:  .LBB4_1:
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:  .LBB4_3: # %entry
+; SSE-NEXT:    maxss %xmm2, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: max_nnan:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB4_1
+; AVX1-NEXT:  # %bb.2: # %entry
+; AVX1-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB4_1:
+; AVX1-NEXT:    vmovaps %xmm1, %xmm2
+; AVX1-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: max_nnan:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    testl %eax, %eax
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+entry:
+  %res = call nnan float @llvm.maximum.f32(float %a, float %b)
+  ret float %res
+}
+
+define float @min_nnan(float %a, float %b) {
+; SSE-LABEL: min_nnan:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    testl %eax, %eax
+; SSE-NEXT:    js .LBB5_1
+; SSE-NEXT:  # %bb.2: # %entry
+; SSE-NEXT:    minss %xmm1, %xmm0
+; SSE-NEXT:    retq
+; SSE-NEXT:  .LBB5_1:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    minss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: min_nnan:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    js .LBB5_1
+; AVX1-NEXT:  # %bb.2: # %entry
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB5_1:
+; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
+; AVX1-NEXT:    vminss %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: min_nnan:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    testl %eax, %eax
+; AVX512-NEXT:    sets %al
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vmovaps %xmm1, %xmm2
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+entry:
+  %res = call nnan float @llvm.minimum.f32(float %a, float %b)
+  ret float %res
+}
+
+define <2 x float> @maxv2_nnan(<2 x float> %a, <2 x float> %b) {
+; SSE2-LABEL: maxv2_nnan:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    maxps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: maxv2_nnan:
+; SSE4:       # %bb.0: # %entry
+; SSE4-NEXT:    movaps %xmm0, %xmm2
+; SSE4-NEXT:    blendvps %xmm0, %xmm1, %xmm2
+; SSE4-NEXT:    blendvps %xmm0, %xmm0, %xmm1
+; SSE4-NEXT:    maxps %xmm2, %xmm1
+; SSE4-NEXT:    movaps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: maxv2_nnan:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmaxps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call nnan <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %res
+}
+
+define <2 x float> @minv2_nnan(<2 x float> %a, <2 x float> %b) {
+; SSE2-LABEL: minv2_nnan:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    minps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: minv2_nnan:
+; SSE4:       # %bb.0: # %entry
+; SSE4-NEXT:    movaps %xmm1, %xmm2
+; SSE4-NEXT:    blendvps %xmm0, %xmm0, %xmm2
+; SSE4-NEXT:    blendvps %xmm0, %xmm1, %xmm0
+; SSE4-NEXT:    minps %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: minv2_nnan:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vminps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call nnan <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %res
+}
+
+; =================== nsz =============================
+define float @max_nsz(float %a, float %b) {
+; SSE-LABEL: max_nsz:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    cmpunordss %xmm0, %xmm2
+; SSE-NEXT:    movaps %xmm2, %xmm3
+; SSE-NEXT:    andps %xmm0, %xmm3
+; SSE-NEXT:    andnps %xmm1, %xmm2
+; SSE-NEXT:    orps %xmm3, %xmm2
+; SSE-NEXT:    maxss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: max_nsz:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: max_nsz:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+entry:
+  %res = call nsz float @llvm.maximum.f32(float %a, float %b)
+  ret float %res
+}
+
+define float @min_nsz(float %a, float %b) {
+; SSE-LABEL: min_nsz:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    cmpunordss %xmm0, %xmm2
+; SSE-NEXT:    movaps %xmm2, %xmm3
+; SSE-NEXT:    andps %xmm0, %xmm3
+; SSE-NEXT:    andnps %xmm1, %xmm2
+; SSE-NEXT:    orps %xmm3, %xmm2
+; SSE-NEXT:    minss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: min_nsz:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: min_nsz:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+entry:
+  %res = call nsz float @llvm.minimum.f32(float %a, float %b)
+  ret float %res
+}
+
+define <2 x float> @maxv2_nsz(<2 x float> %a, <2 x float> %b) {
+; SSE2-LABEL: maxv2_nsz:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    maxps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: maxv2_nsz:
+; SSE4:       # %bb.0: # %entry
+; SSE4-NEXT:    movaps %xmm0, %xmm2
+; SSE4-NEXT:    cmpunordps %xmm0, %xmm0
+; SSE4-NEXT:    blendvps %xmm0, %xmm2, %xmm1
+; SSE4-NEXT:    maxps %xmm1, %xmm2
+; SSE4-NEXT:    movaps %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: maxv2_nsz:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call nsz <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %res
+}
+
+define <2 x float> @minv2_nsz(<2 x float> %a, <2 x float> %b) {
+; SSE2-LABEL: minv2_nsz:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    cmpunordps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    andps %xmm2, %xmm3
+; SSE2-NEXT:    andnps %xmm1, %xmm2
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    minps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: minv2_nsz:
+; SSE4:       # %bb.0: # %entry
+; SSE4-NEXT:    movaps %xmm0, %xmm2
+; SSE4-NEXT:    cmpunordps %xmm0, %xmm0
+; SSE4-NEXT:    blendvps %xmm0, %xmm2, %xmm1
+; SSE4-NEXT:    minps %xmm1, %xmm2
+; SSE4-NEXT:    movaps %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: minv2_nsz:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call nsz <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %res
+}
+
+; =================== nan + nsz =============================
+define float @max_nsz_nnan(float %a, float %b) {
+; SSE-LABEL: max_nsz_nnan:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    maxss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: max_nsz_nnan:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call nsz nnan float @llvm.maximum.f32(float %a, float %b)
+  ret float %res
+}
+
+define float @min_nsz_nnan(float %a, float %b) {
+; SSE-LABEL: min_nsz_nnan:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    minss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: min_nsz_nnan:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call nsz nnan float @llvm.minimum.f32(float %a, float %b)
+  ret float %res
+}
+
+define <2 x float> @maxv2_nsz_nnan(<2 x float> %a, <2 x float> %b) {
+; SSE-LABEL: maxv2_nsz_nnan:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    maxps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: maxv2_nsz_nnan:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call nsz nnan <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %res
+}
+
+define <2 x float> @minv2_nsz_nnan(<2 x float> %a, <2 x float> %b) {
+; SSE-LABEL: minv2_nsz_nnan:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    minps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: minv2_nsz_nnan:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %res = call nsz nnan <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %res
+}
+
+define float @max_nsz_0(float %b, float %c) {
+; SSE-LABEL: max_nsz_0:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    maxss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: max_nsz_0:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %a = fadd nnan float %c, 5.
+  %res = call nsz float @llvm.maximum.f32(float %a, float %b)
+  ret float %res
+}
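Note (illustration only, not part of the patch): the two preparation steps described in the combineFMinimumFMaximum comment can be modelled by the small standalone C++ sketch below. The helper names (x86Max, x86Min, signBit, ieeeMaximum, ieeeMinimum) are invented for this example; x86Max/x86Min merely stand in for the X86ISD::FMAX/FMIN behaviour the comment relies on, namely that the second operand is returned when the inputs are unordered or compare equal (e.g. -0.0 vs 0.0).

// Scalar model of the FMINIMUM/FMAXIMUM lowering algorithm (sketch only).
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <utility>

// Models of the x86 MAXSS/MINSS selection rule: on NaN or equal inputs the
// second operand is returned.
static float x86Max(float a, float b) { return a > b ? a : b; }
static float x86Min(float a, float b) { return a < b ? a : b; }

// Same idea as the integer bitcast + SETLT in the combine: test the sign bit.
static bool signBit(float x) {
  uint32_t Bits;
  std::memcpy(&Bits, &x, sizeof(Bits));
  return Bits >> 31;
}

// Step 1: make sure -0.0 cannot end up as the second operand of MAX
// (resp. 0.0 for MIN). Step 2: if the first operand is NaN, copy it into the
// second position so the MAX/MIN selection rule propagates it.
static float ieeeMaximum(float Op0, float Op1) {
  if (signBit(Op1))
    std::swap(Op0, Op1);
  if (std::isnan(Op0))
    Op1 = Op0;
  return x86Max(Op0, Op1);
}

static float ieeeMinimum(float Op0, float Op1) {
  if (signBit(Op0))
    std::swap(Op0, Op1);
  if (std::isnan(Op0))
    Op1 = Op0;
  return x86Min(Op0, Op1);
}

int main() {
  // Signed-zero cases: maximum picks +0.0, minimum picks -0.0.
  assert(!signBit(ieeeMaximum(-0.0f, 0.0f)));
  assert(!signBit(ieeeMaximum(0.0f, -0.0f)));
  assert(signBit(ieeeMinimum(-0.0f, 0.0f)));
  assert(signBit(ieeeMinimum(0.0f, -0.0f)));
  // NaN cases: a NaN in either position propagates.
  assert(std::isnan(ieeeMaximum(NAN, 1.0f)));
  assert(std::isnan(ieeeMinimum(1.0f, NAN)));
  return 0;
}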