diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7993,7 +7993,9 @@ // If the target has FMINIMUM/FMAXIMUM but not FMINNUM/FMAXNUM use that // instead if there are no NaNs and there can't be an incompatible zero // compare: at least one operand isn't +/-0, or there are no signed-zeros. - if (Node->getFlags().hasNoNaNs() && + if ((Node->getFlags().hasNoNaNs() || + (DAG.isKnownNeverNaN(Node->getOperand(0)) && + DAG.isKnownNeverNaN(Node->getOperand(1)))) && (Node->getFlags().hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(Node->getOperand(0)) || DAG.isKnownNeverZeroFloat(Node->getOperand(1)))) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1002,6 +1002,9 @@ addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); + setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom); + setOperationAction(ISD::FMINIMUM, MVT::f32, Custom); + setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); @@ -1038,6 +1041,9 @@ addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); + setOperationAction(ISD::FMAXIMUM, MVT::f64, Custom); + setOperationAction(ISD::FMINIMUM, MVT::f64, Custom); + for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { setOperationAction(ISD::SDIV, VT, Custom); @@ -2124,6 +2130,8 @@ setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom); + setOperationAction(ISD::FMINIMUM, MVT::f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); @@ -30194,6 +30202,127 @@ return SDValue(); } +static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) && + "Expected FMAXIMUM or FMINIMUM opcode"); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + SDLoc DL(Op); + uint64_t SizeInBits = VT.getFixedSizeInBits(); + APInt PreferredZero = APInt::getZero(SizeInBits); + EVT IVT = MVT::getIntegerVT(SizeInBits); + X86ISD::NodeType MinMaxOp; + if (Op.getOpcode() == ISD::FMAXIMUM) { + MinMaxOp = X86ISD::FMAX; + } else { + PreferredZero.setSignBit(); + MinMaxOp = X86ISD::FMIN; + } + EVT SetCCType = + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + + // The tables below show the expected result of Max in cases of NaN and + // signed zeros. + // + // Y Y + // Num xNaN +0 -0 + // --------------- --------------- + // Num | Max | qNaN | +0 | +0 | +0 | + // X --------------- X --------------- + // xNaN | qNaN | qNaN | -0 | +0 | -0 | + // --------------- --------------- + // + // It is achieved by means of FMAX/FMIN with preliminary checks and operand + // reordering. + // + // We check if any of operands is NaN and return NaN. 
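+  // For example, fmaximum(-0.0, +0.0) must return +0.0 and fmaximum(NaN, 1.0)
+  // must return NaN, neither of which a bare FMAX guarantees: MAXSS/MAXSD
+  // simply return the second source operand when both inputs are zeros or
+  // when either input is NaN.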
+  // Then we check whether either operand is zero or negative zero (for
+  // fmaximum and fminimum, respectively) to ensure the correct zero is
+  // returned.
+  auto IsPreferredZero = [PreferredZero](SDValue Op) {
+    Op = peekThroughBitcasts(Op);
+    if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
+      return CstOp->getValueAPF().bitcastToAPInt() == PreferredZero;
+    if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
+      return CstOp->getAPIntValue() == PreferredZero;
+    return false;
+  };
+
+  SDValue MinMax;
+  bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
+  bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
+  if (DAG.getTarget().Options.NoSignedZerosFPMath ||
+      Op->getFlags().hasNoSignedZeros() || IsPreferredZero(Y) ||
+      DAG.isKnownNeverZeroFloat(X)) {
+    MinMax = DAG.getNode(MinMaxOp, DL, VT, X, Y, Op->getFlags());
+  } else if (IsPreferredZero(X) || DAG.isKnownNeverZeroFloat(Y)) {
+    MinMax = DAG.getNode(MinMaxOp, DL, VT, Y, X, Op->getFlags());
+  } else if ((VT == MVT::f16 || Subtarget.hasDQI()) &&
+             (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
+    if (IsXNeverNaN)
+      std::swap(X, Y);
+    // VFPCLASSS consumes a vector type, so provide a minimal one that
+    // corresponds to an xmm register.
+    MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
+    SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
+    // Bits of classes:
+    // Bits  Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4]  Imm8[5]  Imm8[6] Imm8[7]
+    // Class    QNAN PosZero NegZero  PosINF  NegINF Denormal Negative    SNAN
+    SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
+                                        DL, MVT::i32);
+    SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
+    SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+                              DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
+                              DAG.getIntPtrConstant(0, DL));
+    SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
+    SDValue NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
+    SDValue NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
+    return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
+  } else {
+    SDValue IsXZero;
+    if (Subtarget.is64Bit() || VT != MVT::f64) {
+      SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
+      SDValue ZeroCst = DAG.getConstant(PreferredZero, DL, IVT);
+      IsXZero = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETEQ);
+    } else {
+      assert(VT == MVT::f64);
+      SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
+                                DAG.getConstantFP(0, DL, MVT::v2f64), X,
+                                DAG.getIntPtrConstant(0, DL));
+      SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
+      SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
+                               DAG.getIntPtrConstant(0, DL));
+      Lo = DAG.getBitcast(MVT::i32, Lo);
+      SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
+                               DAG.getIntPtrConstant(1, DL));
+      Hi = DAG.getBitcast(MVT::i32, Hi);
+      PreferredZero = APInt::getZero(SizeInBits / 2);
+      if (MinMaxOp == X86ISD::FMIN)
+        PreferredZero.setSignBit();
+      IsXZero = DAG.getNode(ISD::XOR, DL, MVT::i32, Hi,
+                            DAG.getConstant(PreferredZero, DL, MVT::i32));
+      IsXZero = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, IsXZero);
+      IsXZero = DAG.getSetCC(DL, SetCCType, IsXZero,
+                             DAG.getConstant(0, DL, MVT::i32), ISD::SETEQ);
+    }
+    SDValue NewX = DAG.getSelect(DL, VT, IsXZero, Y, X);
+    SDValue NewY = DAG.getSelect(DL, VT, IsXZero, X, Y);
+    MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
+  }
+
+  if (Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN)) {
+    return MinMax;
+  }
+
+  APFloat NaNValue = APFloat::getNaN(DAG.EVTToAPFloatSemantics(VT));
+  SDValue IsNaN = DAG.getSetCC(DL,
SetCCType, IsXNeverNaN ? Y : X, + IsYNeverNaN ? X : Y, ISD::SETUO); + return DAG.getSelect(DL, VT, IsNaN, DAG.getConstantFP(NaNValue, DL, VT), + MinMax); +} + static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); @@ -33934,6 +34063,9 @@ case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG); + case ISD::FMINIMUM: + case ISD::FMAXIMUM: + return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG); case ISD::ABS: return LowerABS(Op, Subtarget, DAG); case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -206,23 +206,23 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) { ; THRU-LABEL: 'fmaximum' -; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; THRU-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) +; THRU-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'fmaximum' -; LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; LATE-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) +; LATE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmaximum' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; SIZE-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) +; SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'fmaximum' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> 
%va, <16 x float> %vb) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %s = call float @llvm.maximum.f32(float %a, float %b) diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll @@ -0,0 +1,229 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s + +declare half @llvm.minimum.f16(half, half) +declare half @llvm.maximum.f16(half, half) +declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>) +declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>) + +define half @test_fminimum(half %x, half %y) { +; CHECK-LABEL: test_fminimum: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: cmpl $32768, %eax # imm = 0x8000 +; CHECK-NEXT: sete %al +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vmovaps %xmm0, %xmm2 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vcmpunordsh %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vminsh %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k2} +; CHECK-NEXT: retq + %z = call half @llvm.minimum.f16(half %x, half %y) + ret half %z +} + +define <8 x half> @test_fminimum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; CHECK-LABEL: test_fminimum_scalarize: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vminsh %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] +; CHECK-NEXT: vminsh %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vminsh %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] +; CHECK-NEXT: vminsh %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm3 +; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm4 +; CHECK-NEXT: vminsh %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-NEXT: vminsh %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0 +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; CHECK-NEXT: 
vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: retq + %r = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %r +} + +define half @test_fminimum_nnan(half %x, half %y) "no-nans-fp-math"="true" { +; CHECK-LABEL: test_fminimum_nnan: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasssh $5, %xmm1, %k1 +; CHECK-NEXT: vmovaps %xmm0, %xmm2 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vminsh %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %1 = tail call half @llvm.minimum.f16(half %x, half %y) + ret half %1 +} + +define half @test_fminimum_zero(half %x, half %y) { +; CHECK-LABEL: test_fminimum_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: vcmpunordsh %xmm1, %xmm1, %k1 +; CHECK-NEXT: vminsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; CHECK-NEXT: vmovsh %xmm2, %xmm0, %xmm0 {%k1} +; CHECK-NEXT: retq + %1 = tail call half @llvm.minimum.f16(half -0.0, half %y) + ret half %1 +} + +define half @test_fminimum_nsz(half %x, half %y) { +; CHECK-LABEL: test_fminimum_nsz: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpunordsh %xmm1, %xmm0, %k1 +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} +; CHECK-NEXT: retq + %1 = tail call nsz half @llvm.minimum.f16(half %x, half %y) + ret half %1 +} + +define half @test_fminimum_combine_cmps(half %x, half %y) { +; CHECK-LABEL: test_fminimum_combine_cmps: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm0, %xmm1, %xmm1 +; CHECK-NEXT: vfpclasssh $5, %xmm0, %k1 +; CHECK-NEXT: vmovaps %xmm1, %xmm2 +; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} +; CHECK-NEXT: vminsh %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = fdiv nnan half %y, %x + %2 = tail call half @llvm.minimum.f16(half %x, half %1) + ret half %2 +} + +define half @test_fmaximum(half %x, half %y) { +; CHECK-LABEL: test_fmaximum: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: testw %ax, %ax +; CHECK-NEXT: sete %al +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vmovaps %xmm0, %xmm2 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vcmpunordsh %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmaxsh %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k2} +; CHECK-NEXT: retq + %r = call half @llvm.maximum.f16(half %x, half %y) + ret half %r +} + +define <8 x half> @test_fmaximum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; CHECK-LABEL: test_fmaximum_scalarize: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vmaxsh %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] +; CHECK-NEXT: vmaxsh %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: 
vmaxsh %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] +; CHECK-NEXT: vmaxsh %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm3 +; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm4 +; CHECK-NEXT: vmaxsh %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-NEXT: vmaxsh %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0 +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: retq + %r = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %r +} + +define half @test_fmaximum_nnan(half %x, half %y) { +; CHECK-LABEL: test_fmaximum_nnan: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vfpclasssh $3, %xmm0, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm1 +; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovsh %xmm2, %xmm0, %xmm0 {%k1} +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = fadd nnan half %x, %y + %2 = fsub nnan half %x, %y + %3 = tail call half @llvm.maximum.f16(half %1, half %2) + ret half %3 +} + +define half @test_fmaximum_zero(half %x, half %y) { +; CHECK-LABEL: test_fmaximum_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: vcmpunordsh %xmm1, %xmm1, %k1 +; CHECK-NEXT: vmovsh %xmm2, %xmm0, %xmm0 {%k1} +; CHECK-NEXT: retq + %1 = tail call half @llvm.maximum.f16(half 0.0, half %y) + ret half %1 +} + +define half @test_fmaximum_nsz(half %x, half %y) "no-signed-zeros-fp-math"="true" { +; CHECK-LABEL: test_fmaximum_nsz: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpunordsh %xmm1, %xmm0, %k1 +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} +; CHECK-NEXT: retq + %1 = tail call half @llvm.maximum.f16(half %x, half %y) + ret half %1 +} + +define half @test_fmaximum_combine_cmps(half %x, half %y) { +; CHECK-LABEL: test_fmaximum_combine_cmps: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm0, %xmm1, %xmm1 +; CHECK-NEXT: vfpclasssh $3, %xmm0, %k1 +; CHECK-NEXT: vmovaps %xmm1, %xmm2 +; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} +; CHECK-NEXT: vmaxsh %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = fdiv nnan half %y, %x + %2 = tail call half @llvm.maximum.f16(half %x, half %1) + ret half %2 +} diff --git a/llvm/test/CodeGen/X86/extract-fp.ll b/llvm/test/CodeGen/X86/extract-fp.ll --- a/llvm/test/CodeGen/X86/extract-fp.ll +++ b/llvm/test/CodeGen/X86/extract-fp.ll @@ -105,17 +105,41 @@ ret double %r } -;define double @ext_maximum_v4f64(<2 x double> %x) nounwind { -; %v = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) -; %r = 
extractelement <2 x double> %v, i32 1 -; ret double %r -;} +define double @ext_maximum_v4f64(<2 x double> %x) nounwind { +; CHECK-LABEL: ext_maximum_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: maxsd %xmm0, %xmm1 +; CHECK-NEXT: cmpunordsd %xmm0, %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: andpd %xmm0, %xmm2 +; CHECK-NEXT: andnpd %xmm1, %xmm0 +; CHECK-NEXT: orpd %xmm2, %xmm0 +; CHECK-NEXT: retq + %v = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) + %r = extractelement <2 x double> %v, i32 1 + ret double %r +} -;define float @ext_minimum_v4f32(<4 x float> %x) nounwind { -; %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> ) -; %r = extractelement <4 x float> %v, i32 1 -; ret float %r -;} +define float @ext_minimum_v4f32(<4 x float> %x) nounwind { +; CHECK-LABEL: ext_minimum_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: minss %xmm0, %xmm1 +; CHECK-NEXT: cmpunordss %xmm0, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: andps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm1, %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; CHECK-NEXT: retq + %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> ) + %r = extractelement <4 x float> %v, i32 1 + ret float %r +} declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -672,29 +672,205 @@ ret double %r } -;define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { -; %v = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y) -; %r = extractelement <4 x float> %v, i32 0 -; ret float %r -;} +define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { +; X64-LABEL: fmaximum_v4f32: +; X64: # %bb.0: +; X64-NEXT: vmovd %xmm0, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: je .LBB30_1 +; X64-NEXT: # %bb.2: +; X64-NEXT: vmovdqa %xmm1, %xmm2 +; X64-NEXT: vmovdqa %xmm0, %xmm3 +; X64-NEXT: jmp .LBB30_3 +; X64-NEXT: .LBB30_1: +; X64-NEXT: vmovdqa %xmm0, %xmm2 +; X64-NEXT: vmovdqa %xmm1, %xmm3 +; X64-NEXT: .LBB30_3: +; X64-NEXT: vmaxss %xmm2, %xmm3, %xmm2 +; X64-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0 +; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: fmaximum_v4f32: +; X86: # %bb.0: +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: je .LBB30_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: vmovdqa %xmm0, %xmm3 +; X86-NEXT: jmp .LBB30_3 +; X86-NEXT: .LBB30_1: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm3 +; X86-NEXT: .LBB30_3: +; X86-NEXT: pushl %eax +; X86-NEXT: vmaxss %xmm2, %xmm3, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0 +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %v = call <4 x 
float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y) + %r = extractelement <4 x float> %v, i32 0 + ret float %r +} -;define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { -; %v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %x, <4 x double> %y) -; %r = extractelement <4 x double> %v, i32 0 -; ret double %r -;} +define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { +; X64-LABEL: fmaximum_v4f64: +; X64: # %bb.0: +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: testq %rax, %rax +; X64-NEXT: je .LBB31_1 +; X64-NEXT: # %bb.2: +; X64-NEXT: vmovdqa %xmm1, %xmm2 +; X64-NEXT: vmovdqa %xmm0, %xmm3 +; X64-NEXT: jmp .LBB31_3 +; X64-NEXT: .LBB31_1: +; X64-NEXT: vmovdqa %xmm0, %xmm2 +; X64-NEXT: vmovdqa %xmm1, %xmm3 +; X64-NEXT: .LBB31_3: +; X64-NEXT: vmaxsd %xmm2, %xmm3, %xmm2 +; X64-NEXT: vcmpunordsd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: fmaximum_v4f64: +; X86: # %bb.0: +; X86-NEXT: vpextrd $1, %xmm0, %eax +; X86-NEXT: vmovd %xmm0, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: je .LBB31_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: vmovdqa %xmm0, %xmm3 +; X86-NEXT: jmp .LBB31_3 +; X86-NEXT: .LBB31_1: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm3 +; X86-NEXT: .LBB31_3: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmaxsd %xmm2, %xmm3, %xmm2 +; X86-NEXT: vcmpunordsd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %x, <4 x double> %y) + %r = extractelement <4 x double> %v, i32 0 + ret double %r +} -;define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { -; %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y) -; %r = extractelement <4 x float> %v, i32 0 -; ret float %r -;} +define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { +; X64-LABEL: fminimum_v4f32: +; X64: # %bb.0: +; X64-NEXT: vmovd %xmm0, %eax +; X64-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: je .LBB32_1 +; X64-NEXT: # %bb.2: +; X64-NEXT: vmovdqa %xmm1, %xmm2 +; X64-NEXT: vmovdqa %xmm0, %xmm3 +; X64-NEXT: jmp .LBB32_3 +; X64-NEXT: .LBB32_1: +; X64-NEXT: vmovdqa %xmm0, %xmm2 +; X64-NEXT: vmovdqa %xmm1, %xmm3 +; X64-NEXT: .LBB32_3: +; X64-NEXT: vminss %xmm2, %xmm3, %xmm2 +; X64-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0 +; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: fminimum_v4f32: +; X86: # %bb.0: +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; X86-NEXT: je .LBB32_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: vmovdqa %xmm0, %xmm3 +; X86-NEXT: jmp .LBB32_3 +; X86-NEXT: .LBB32_1: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm3 +; X86-NEXT: .LBB32_3: +; X86-NEXT: pushl %eax +; X86-NEXT: vminss %xmm2, %xmm3, %xmm2 +; X86-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0 +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl + %v = 
call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y) + %r = extractelement <4 x float> %v, i32 0 + ret float %r +} -;define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { -; %v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %x, <4 x double> %y) -; %r = extractelement <4 x double> %v, i32 0 -; ret double %r -;} +define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { +; X64-LABEL: fminimum_v4f64: +; X64: # %bb.0: +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB33_1 +; X64-NEXT: # %bb.2: +; X64-NEXT: vmovdqa %xmm1, %xmm2 +; X64-NEXT: vmovdqa %xmm0, %xmm3 +; X64-NEXT: jmp .LBB33_3 +; X64-NEXT: .LBB33_1: +; X64-NEXT: vmovdqa %xmm0, %xmm2 +; X64-NEXT: vmovdqa %xmm1, %xmm3 +; X64-NEXT: .LBB33_3: +; X64-NEXT: vminsd %xmm2, %xmm3, %xmm2 +; X64-NEXT: vcmpunordsd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: fminimum_v4f64: +; X86: # %bb.0: +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: vpextrd $1, %xmm0, %ecx +; X86-NEXT: addl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: je .LBB33_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: vmovdqa %xmm0, %xmm3 +; X86-NEXT: jmp .LBB33_3 +; X86-NEXT: .LBB33_1: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm3 +; X86-NEXT: .LBB33_3: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vminsd %xmm2, %xmm3, %xmm2 +; X86-NEXT: vcmpunordsd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: vzeroupper +; X86-NEXT: retl + %v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %x, <4 x double> %y) + %r = extractelement <4 x double> %v, i32 0 + ret double %r +} define float @maxps_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: maxps_v4f32: diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -0,0 +1,1058 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86 + +declare float @llvm.maximum.f32(float, float) +declare double @llvm.maximum.f64(double, double) +declare float @llvm.minimum.f32(float, float) +declare double @llvm.minimum.f64(double, double) +declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>) + +; +; fmaximum +; + +define float @test_fmaximum(float %x, float %y) { +; SSE2-LABEL: test_fmaximum: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; 
SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: je .LBB0_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: .LBB0_2: +; SSE2-NEXT: maxss %xmm3, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximum: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-NEXT: je .LBB0_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa %xmm0, %xmm3 +; AVX1-NEXT: .LBB0_2: +; AVX1-NEXT: vmaxss %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximum: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sete %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordss %xmm1, %xmm0, %k2 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmaxss %xmm1, %xmm2, %xmm0 +; AVX512-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} +; AVX512-NEXT: retq +; +; X86-LABEL: test_fmaximum: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: vmovdqa %xmm0, %xmm3 +; X86-NEXT: je .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm3 +; X86-NEXT: .LBB0_2: +; X86-NEXT: vmaxss %xmm2, %xmm3, %xmm2 +; X86-NEXT: vcmpunordss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %1 = tail call float @llvm.maximum.f32(float %x, float %y) + ret float %1 +} + +define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; SSE2-LABEL: test_fmaximum_scalarize: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; SSE2-NEXT: maxss %xmm2, %xmm3 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE2-NEXT: maxss %xmm2, %xmm4 +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximum_scalarize: +; AVX: # %bb.0: +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = 
xmm1[1,1,3,3] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX-NEXT: retq +; +; X86-LABEL: test_fmaximum_scalarize: +; X86: # %bb.0: +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm2 +; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X86-NEXT: vmaxss %xmm3, %xmm4, %xmm3 +; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; X86-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; X86-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; X86-NEXT: vmaxss %xmm3, %xmm4, %xmm3 +; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; X86-NEXT: retl + %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %r +} + +define float @test_fmaximum_nan0(float %x, float %y) { +; SSE2-LABEL: test_fmaximum_nan0: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximum_nan0: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: retq +; +; X86-LABEL: test_fmaximum_nan0: +; X86: # %bb.0: +; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NEXT: retl + %1 = tail call float @llvm.maximum.f32(float 0x7fff000000000000, float %y) + ret float %1 +} + +define float @test_fmaximum_nan1(float %x, float %y) { +; SSE2-LABEL: test_fmaximum_nan1: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximum_nan1: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: retq +; +; X86-LABEL: test_fmaximum_nan1: +; X86: # %bb.0: +; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NEXT: retl + %1 = tail call float @llvm.maximum.f32(float %x, float 0x7fff000000000000) + ret float %1 +} + +define float @test_fmaximum_nnan(float %x, float %y) { +; SSE2-LABEL: test_fmaximum_nnan: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: addss %xmm1, %xmm0 +; SSE2-NEXT: subss %xmm1, %xmm2 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: je .LBB4_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: maxss %xmm2, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB4_1: +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximum_nnan: +; AVX1: # %bb.0: +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: je .LBB4_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB4_1: +; AVX1-NEXT: vmovaps %xmm2, %xmm0 +; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512F-LABEL: test_fmaximum_nnan: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vsubss 
%xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: testl %eax, %eax +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovaps %xmm0, %xmm1 +; AVX512F-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmaxss %xmm1, %xmm2, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_fmaximum_nnan: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512DQ-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %k1 +; AVX512DQ-NEXT: vmovaps %xmm2, %xmm1 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512DQ-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; X86-LABEL: test_fmaximum_nnan: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: vaddss %xmm1, %xmm2, %xmm0 +; X86-NEXT: vsubss %xmm1, %xmm2, %xmm2 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: je .LBB4_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovaps %xmm2, %xmm1 +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: +; X86-NEXT: vmovaps %xmm0, %xmm1 +; X86-NEXT: vmovaps %xmm2, %xmm0 +; X86-NEXT: .LBB4_3: +; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %1 = fadd nnan float %x, %y + %2 = fsub nnan float %x, %y + %3 = tail call float @llvm.maximum.f32(float %1, float %2) + ret float %3 +} + +define double @test_fmaximum_zero0(double %x, double %y) { +; SSE2-LABEL: test_fmaximum_zero0: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: xorpd %xmm3, %xmm3 +; SSE2-NEXT: maxsd %xmm3, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximum_zero0: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximum_zero0: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} +; AVX512-NEXT: retq +; +; X86-LABEL: test_fmaximum_zero0: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl + %1 = tail call double @llvm.maximum.f64(double 0.0, double %y) + ret double %1 +} + +define double @test_fmaximum_zero1(double %x, double %y) { +; SSE2-LABEL: test_fmaximum_zero1: +; SSE2: # %bb.0: +; 
SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: andpd %xmm1, %xmm2 +; SSE2-NEXT: xorpd %xmm3, %xmm3 +; SSE2-NEXT: maxsd %xmm3, %xmm0 +; SSE2-NEXT: andnpd %xmm0, %xmm1 +; SSE2-NEXT: orpd %xmm2, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximum_zero1: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximum_zero1: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 +; AVX512-NEXT: retq +; +; X86-LABEL: test_fmaximum_zero1: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl + %1 = tail call double @llvm.maximum.f64(double %x, double 0.0) + ret double %1 +} + +define double @test_fmaximum_zero2(double %x, double %y) { +; SSE2-LABEL: test_fmaximum_zero2: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximum_zero2: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_fmaximum_zero2: +; X86: # %bb.0: +; X86-NEXT: fldz +; X86-NEXT: retl + %1 = tail call double @llvm.maximum.f64(double 0.0, double -0.0) + ret double %1 +} + +define float @test_fmaximum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" { +; SSE2-LABEL: test_fmaximum_nsz: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximum_nsz: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpunordss %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fmaximum_nsz: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpunordss %xmm1, %xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} +; AVX512-NEXT: retq +; +; X86-LABEL: test_fmaximum_nsz: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vcmpunordss %xmm0, %xmm1, %xmm2 +; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; 
X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %1 = tail call float @llvm.maximum.f32(float %x, float %y) + ret float %1 +} + +define float @test_fmaximum_combine_cmps(float %x, float %y) { +; SSE2-LABEL: test_fmaximum_combine_cmps: +; SSE2: # %bb.0: +; SSE2-NEXT: divss %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: je .LBB9_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: jmp .LBB9_3 +; SSE2-NEXT: .LBB9_1: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: .LBB9_3: +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fmaximum_combine_cmps: +; AVX1: # %bb.0: +; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: je .LBB9_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: jmp .LBB9_3 +; AVX1-NEXT: .LBB9_1: +; AVX1-NEXT: vmovaps %xmm0, %xmm2 +; AVX1-NEXT: .LBB9_3: +; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512F-LABEL: test_fmaximum_combine_cmps: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: testl %eax, %eax +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovaps %xmm1, %xmm2 +; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k2 +; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vmaxss %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_fmaximum_combine_cmps: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %k1 +; AVX512DQ-NEXT: vmovaps %xmm1, %xmm2 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vmaxss %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; X86-LABEL: test_fmaximum_combine_cmps: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: je .LBB9_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovaps %xmm1, %xmm2 +; X86-NEXT: vmovaps %xmm0, %xmm1 +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: +; X86-NEXT: vmovaps %xmm0, %xmm2 +; X86-NEXT: .LBB9_3: +; X86-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %1 = fdiv nnan float %y, %x + %2 = tail call float @llvm.maximum.f32(float %x, float %1) + ret float %2 +} + +; +; fminimum +; + +define float @test_fminimum(float %x, float %y) { +; SSE2-LABEL: test_fminimum: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: cmpl 
$-2147483648, %eax # imm = 0x80000000 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: je .LBB10_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: .LBB10_2: +; SSE2-NEXT: minss %xmm3, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimum: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-NEXT: je .LBB10_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa %xmm0, %xmm3 +; AVX1-NEXT: .LBB10_2: +; AVX1-NEXT: vminss %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fminimum: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; AVX512-NEXT: sete %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcmpunordss %xmm1, %xmm0, %k2 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vminss %xmm1, %xmm2, %xmm0 +; AVX512-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} +; AVX512-NEXT: retq +; +; X86-LABEL: test_fminimum: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovd %xmm1, %eax +; X86-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; X86-NEXT: vmovdqa %xmm1, %xmm2 +; X86-NEXT: vmovdqa %xmm0, %xmm3 +; X86-NEXT: je .LBB10_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: vmovdqa %xmm0, %xmm2 +; X86-NEXT: vmovdqa %xmm1, %xmm3 +; X86-NEXT: .LBB10_2: +; X86-NEXT: vminss %xmm2, %xmm3, %xmm2 +; X86-NEXT: vcmpunordss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %1 = tail call float @llvm.minimum.f32(float %x, float %y) + ret float %1 +} + +define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { +; SSE2-LABEL: test_fminimum_scalarize: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: minsd %xmm1, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: minsd %xmm1, %xmm0 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimum_scalarize: +; AVX: # %bb.0: +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-NEXT: retq +; +; X86-LABEL: test_fminimum_scalarize: +; X86: # %bb.0: +; X86-NEXT: vminsd %xmm1, %xmm0, %xmm2 +; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; X86-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; 
X86-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; X86-NEXT: retl + %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %r +} + +define float @test_fminimum_nan0(float %x, float %y) { +; SSE2-LABEL: test_fminimum_nan0: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimum_nan0: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: retq +; +; X86-LABEL: test_fminimum_nan0: +; X86: # %bb.0: +; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NEXT: retl + %1 = tail call float @llvm.minimum.f32(float 0x7fff000000000000, float %y) + ret float %1 +} + +define float @test_fminimum_nan1(float %x, float %y) { +; SSE2-LABEL: test_fminimum_nan1: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimum_nan1: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: retq +; +; X86-LABEL: test_fminimum_nan1: +; X86: # %bb.0: +; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X86-NEXT: retl + %1 = tail call float @llvm.minimum.f32(float %x, float 0x7fff000000000000) + ret float %1 +} + +define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true" { +; SSE2-LABEL: test_fminimum_nnan: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE2-NEXT: cmpq %rcx, %rax +; SSE2-NEXT: je .LBB14_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: minsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB14_1: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: minsd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimum_nnan: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: cmpq %rcx, %rax +; AVX1-NEXT: je .LBB14_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB14_1: +; AVX1-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-NEXT: vminsd %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512F-LABEL: test_fminimum_nnan: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX512F-NEXT: cmpq %rcx, %rax +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovapd %xmm1, %xmm2 +; AVX512F-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vminsd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_fminimum_nnan: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vfpclasssd $5, %xmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, %k1 +; AVX512DQ-NEXT: vmovapd %xmm0, %xmm2 +; AVX512DQ-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} +; AVX512DQ-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512DQ-NEXT: vminsd %xmm2, %xmm1, %xmm0 +; AVX512DQ-NEXT: retq +; +; X86-LABEL: test_fminimum_nnan: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vmovd %xmm2, %eax +; X86-NEXT: vpextrd $1, %xmm2, %ecx +; X86-NEXT: addl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: orl 
%eax, %ecx +; X86-NEXT: je .LBB14_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovapd %xmm1, %xmm2 +; X86-NEXT: jmp .LBB14_3 +; X86-NEXT: .LBB14_1: +; X86-NEXT: vmovapd %xmm0, %xmm2 +; X86-NEXT: vmovapd %xmm1, %xmm0 +; X86-NEXT: .LBB14_3: +; X86-NEXT: vminsd %xmm2, %xmm0, %xmm0 +; X86-NEXT: vmovsd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl + %1 = tail call double @llvm.minimum.f64(double %x, double %y) + ret double %1 +} + +define double @test_fminimum_zero0(double %x, double %y) { +; SSE2-LABEL: test_fminimum_zero0: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimum_zero0: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm0 +; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fminimum_zero0: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512-NEXT: vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} +; AVX512-NEXT: retq +; +; X86-LABEL: test_fminimum_zero0: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 +; X86-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl + %1 = tail call double @llvm.minimum.f64(double -0.0, double %y) + ret double %1 +} + +define double @test_fminimum_zero1(double %x, double %y) { +; SSE2-LABEL: test_fminimum_zero1: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: andpd %xmm1, %xmm2 +; SSE2-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: andnpd %xmm0, %xmm1 +; SSE2-NEXT: orpd %xmm2, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimum_zero1: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fminimum_zero1: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} +; AVX512-NEXT: retq +; +; X86-LABEL: test_fminimum_zero1: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 +; X86-NEXT: 
vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovlpd %xmm0, (%esp) +; X86-NEXT: fldl (%esp) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl + %1 = tail call double @llvm.minimum.f64(double %x, double -0.0) + ret double %1 +} + +define double @test_fminimum_zero2(double %x, double %y) { +; SSE2-LABEL: test_fminimum_zero2: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimum_zero2: +; AVX: # %bb.0: +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: retq +; +; X86-LABEL: test_fminimum_zero2: +; X86: # %bb.0: +; X86-NEXT: fldz +; X86-NEXT: fchs +; X86-NEXT: retl + %1 = tail call double @llvm.minimum.f64(double -0.0, double 0.0) + ret double %1 +} + +define float @test_fminimum_nsz(float %x, float %y) { +; SSE2-LABEL: test_fminimum_nsz: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimum_nsz: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpunordss %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_fminimum_nsz: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpunordss %xmm1, %xmm0, %k1 +; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} +; AVX512-NEXT: retq +; +; X86-LABEL: test_fminimum_nsz: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vcmpunordss %xmm0, %xmm1, %xmm2 +; X86-NEXT: vminss %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %1 = tail call nsz float @llvm.minimum.f32(float %x, float %y) + ret float %1 +} + +define float @test_fminimum_combine_cmps(float %x, float %y) { +; SSE2-LABEL: test_fminimum_combine_cmps: +; SSE2: # %bb.0: +; SSE2-NEXT: divss %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; SSE2-NEXT: je .LBB19_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: jmp .LBB19_3 +; SSE2-NEXT: .LBB19_1: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: .LBB19_3: +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: test_fminimum_combine_cmps: +; AVX1: # %bb.0: +; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; AVX1-NEXT: je .LBB19_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vmovaps %xmm1, %xmm2 +; AVX1-NEXT: vmovaps %xmm0, %xmm1 +; AVX1-NEXT: jmp .LBB19_3 +; AVX1-NEXT: .LBB19_1: +; AVX1-NEXT: vmovaps %xmm0, %xmm2 +; 
AVX1-NEXT: .LBB19_3: +; AVX1-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512F-LABEL: test_fminimum_combine_cmps: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovaps %xmm1, %xmm2 +; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k2 +; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vminss %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_fminimum_combine_cmps: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; AVX512DQ-NEXT: vfpclassss $5, %xmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %k1 +; AVX512DQ-NEXT: vmovaps %xmm1, %xmm2 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vminss %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; X86-LABEL: test_fminimum_combine_cmps: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vdivss %xmm0, %xmm1, %xmm1 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; X86-NEXT: je .LBB19_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: vmovaps %xmm1, %xmm2 +; X86-NEXT: vmovaps %xmm0, %xmm1 +; X86-NEXT: jmp .LBB19_3 +; X86-NEXT: .LBB19_1: +; X86-NEXT: vmovaps %xmm0, %xmm2 +; X86-NEXT: .LBB19_3: +; X86-NEXT: vminss %xmm2, %xmm1, %xmm1 +; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl + %1 = fdiv nnan float %y, %x + %2 = tail call float @llvm.minimum.f32(float %x, float %1) + ret float %2 +} diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1360,22 +1360,17 @@ define half @pr61271(half %0, half %1) #0 { ; CHECK-LIBCALL-LABEL: pr61271: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: subq $40, %rsp +; CHECK-LIBCALL-NEXT: pushq %rax ; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-LIBCALL-NEXT: movaps %xmm1, %xmm0 ; CHECK-LIBCALL-NEXT: callq __extendhfsf2@PLT -; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movss %xmm0, (%rsp) # 4-byte Spill ; CHECK-LIBCALL-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-LIBCALL-NEXT: callq __extendhfsf2@PLT -; CHECK-LIBCALL-NEXT: movaps %xmm0, %xmm1 -; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; CHECK-LIBCALL-NEXT: cmpltss %xmm2, %xmm1 -; CHECK-LIBCALL-NEXT: andps %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: andnps %xmm2, %xmm1 -; CHECK-LIBCALL-NEXT: orps %xmm1, %xmm0 +; CHECK-LIBCALL-NEXT: minss (%rsp), %xmm0 # 4-byte Folded Reload ; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT -; CHECK-LIBCALL-NEXT: addq $40, %rsp +; CHECK-LIBCALL-NEXT: popq %rax ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: pr61271: @@ -1388,8 +1383,7 @@ ; 
BWON-F16C-NEXT: movzwl %ax, %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vcmpltss %xmm0, %xmm1, %xmm2 -; BWON-F16C-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; BWON-F16C-NEXT: vminss %xmm0, %xmm1, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovd %xmm0, %eax ; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 @@ -1411,13 +1405,8 @@ ; CHECK-I686-NEXT: calll __extendhfsf2 ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movaps %xmm1, %xmm2 -; CHECK-I686-NEXT: cmpltss %xmm0, %xmm2 -; CHECK-I686-NEXT: andps %xmm2, %xmm1 -; CHECK-I686-NEXT: andnps %xmm0, %xmm2 -; CHECK-I686-NEXT: orps %xmm1, %xmm2 -; CHECK-I686-NEXT: movss %xmm2, (%esp) +; CHECK-I686-NEXT: minss {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncsfhf2 ; CHECK-I686-NEXT: addl $44, %esp ; CHECK-I686-NEXT: retl