Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -559,10 +559,19 @@ FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR, /// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two /// values. - /// In the case where a single input is NaN, the non-NaN input is returned. + // + /// In the case where a single input is a NaN (either signaling or quiet), + /// the non-NaN input is returned. /// /// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0. FMINNUM, FMAXNUM, + + /// FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on + /// two values, following the IEEE-754 2008 definition. This differs from + /// FMINNUM/FMAXNUM in the handling of signaling NaNs. If one input is a + /// signaling NaN, returns a quiet NaN. + FMINNUM_IEEE, FMAXNUM_IEEE, + /// FMINNAN/FMAXNAN - Behave identically to FMINNUM/FMAXNUM, except that /// when a single input is NaN, NaN is returned. FMINNAN, FMAXNAN, Index: include/llvm/CodeGen/TargetLowering.h =================================================================== --- include/llvm/CodeGen/TargetLowering.h +++ include/llvm/CodeGen/TargetLowering.h @@ -3606,6 +3606,9 @@ /// \returns True, if the expansion was successful, false otherwise bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs. + SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const; + /// Turn load of vector type into a load of the individual elements. /// \param LD load to expand /// \returns MERGE_VALUEs of the scalar loads with their chains. 
Index: include/llvm/Target/TargetSelectionDAG.td =================================================================== --- include/llvm/Target/TargetSelectionDAG.td +++ include/llvm/Target/TargetSelectionDAG.td @@ -406,6 +406,11 @@ [SDNPCommutative, SDNPAssociative]>; def fmaxnum : SDNode<"ISD::FMAXNUM" , SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; +def fminnum_ieee : SDNode<"ISD::FMINNUM_IEEE", SDTFPBinOp, + [SDNPCommutative]>; +def fmaxnum_ieee : SDNode<"ISD::FMAXNUM_IEEE", SDTFPBinOp, + [SDNPCommutative]>; + def fminnan : SDNode<"ISD::FMINNAN" , SDTFPBinOp>; def fmaxnan : SDNode<"ISD::FMAXNAN" , SDTFPBinOp>; def fgetsign : SDNode<"ISD::FGETSIGN" , SDTFPToIntOp>; Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7070,6 +7070,13 @@ case ISD::SETLE: case ISD::SETULT: case ISD::SETULE: { + // Since it's known never nan to get here already, either fminnum or + // fminnum_ieee are OK. Try the ieee version first, since it's fminnum is + // expanded in terms of it. + unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; + if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT)) + return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS); + unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM; if (TLI.isOperationLegalOrCustom(Opcode, TransformVT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); @@ -7081,6 +7088,10 @@ case ISD::SETGE: case ISD::SETUGT: case ISD::SETUGE: { + unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE; + if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT)) + return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS); + unsigned Opcode = (LHS == True) ? 
ISD::FMAXNUM : ISD::FMINNUM; if (TLI.isOperationLegalOrCustom(Opcode, TransformVT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3253,7 +3253,12 @@ Results.push_back(Tmp1); break; } - + case ISD::FMINNUM: + case ISD::FMAXNUM: { + if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG)) + Results.push_back(Expanded); + break; + } case ISD::FSIN: case ISD::FCOS: { EVT VT = Node->getValueType(0); Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -130,6 +130,7 @@ SDValue ExpandBITREVERSE(SDValue Op); SDValue ExpandCTLZ(SDValue Op); SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op); + SDValue ExpandFMINNUM_FMAXNUM(SDValue Op); SDValue ExpandStrictFPOp(SDValue Op); /// Implements vector promotion. 
@@ -362,6 +363,8 @@ case ISD::FABS: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case ISD::FMINNAN: case ISD::FMAXNAN: case ISD::FCOPYSIGN: @@ -736,6 +739,9 @@ return ExpandCTLZ(Op); case ISD::CTTZ_ZERO_UNDEF: return ExpandCTTZ_ZERO_UNDEF(Op); + case ISD::FMINNUM: + case ISD::FMAXNUM: + return ExpandFMINNUM_FMAXNUM(Op); case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -1141,6 +1147,12 @@ return DAG.UnrollVectorOp(Op.getNode()); } +SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) { + if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG)) + return Expanded; + return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) { EVT VT = Op.getValueType(); EVT EltVT = VT.getVectorElementType(); Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -113,6 +113,8 @@ case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case ISD::FMINNAN: case ISD::FMAXNAN: case ISD::SMIN: Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3712,9 +3712,31 @@ // TODO: Refine on operand return false; } + case ISD::FMINNUM: + case ISD::FMAXNUM: { + // Only one needs to be known not-nan, since it will be returned if the + // other ends up being one. + return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) || + isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); + } + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: { + if (SNaN) + return true; + // This can return a NaN if either operand is an sNaN, or if both operands + // are NaN. 
+ return (isKnownNeverNaN(Op.getOperand(0), false, Depth + 1) && + isKnownNeverSNaN(Op.getOperand(1), Depth + 1)) || + (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) && + isKnownNeverSNaN(Op.getOperand(0), Depth + 1)); + } + case ISD::FMINNAN: + case ISD::FMAXNAN: { + // TODO: Does this quiet or return the origina NaN as-is? + return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && + isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); - // TODO: Handle FMINNUM/FMAXNUM/FMINNAN/FMAXNAN when there is an agreement on - // what they should do. + } case ISD::EXTRACT_VECTOR_ELT: { return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); } Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -175,6 +175,9 @@ case ISD::FABS: return "fabs"; case ISD::FMINNUM: return "fminnum"; case ISD::FMAXNUM: return "fmaxnum"; + case ISD::FMINNUM_IEEE: return "fminnum_ieee"; + case ISD::FMAXNUM_IEEE: return "fmaxnum_ieee"; + case ISD::FMINNAN: return "fminnan"; case ISD::FMAXNAN: return "fmaxnan"; case ISD::FNEG: return "fneg"; Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4027,6 +4027,35 @@ return true; } +SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, + SelectionDAG &DAG) const { + SDLoc dl(Node); + unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ? + ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; + EVT VT = Node->getValueType(0); + if (isOperationLegalOrCustom(NewOp, VT)) { + SDValue Quiet0 = Node->getOperand(0); + SDValue Quiet1 = Node->getOperand(1); + + if (!Node->getFlags().hasNoNaNs()) { + // Insert canonicalizes if it's possible we need to quiet to get correct + // sNaN behavior. 
+ if (!DAG.isKnownNeverSNaN(Quiet0)) { + Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0, + Node->getFlags()); + } + if (!DAG.isKnownNeverSNaN(Quiet1)) { + Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1, + Node->getFlags()); + } + } + + return DAG.getNode(NewOp, dl, VT, Quiet0, Quiet1, Node->getFlags()); + } + + return SDValue(); +} + SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const { SDLoc SL(LD); Index: lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- lib/CodeGen/TargetLoweringBase.cpp +++ lib/CodeGen/TargetLoweringBase.cpp @@ -599,6 +599,8 @@ setOperationAction(ISD::CONCAT_VECTORS, VT, Expand); setOperationAction(ISD::FMINNUM, VT, Expand); setOperationAction(ISD::FMAXNUM, VT, Expand); + setOperationAction(ISD::FMINNUM_IEEE, VT, Expand); + setOperationAction(ISD::FMAXNUM_IEEE, VT, Expand); setOperationAction(ISD::FMINNAN, VT, Expand); setOperationAction(ISD::FMAXNAN, VT, Expand); setOperationAction(ISD::FMAD, VT, Expand); Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -358,6 +358,7 @@ SIN_HW, FMAX_LEGACY, FMIN_LEGACY, + FMAX3, SMAX3, UMAX3, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -551,6 +551,8 @@ case ISD::FMAD: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case ISD::FSIN: case ISD::FTRUNC: case ISD::FRINT: @@ -3511,6 +3513,10 @@ return ISD::FMINNUM; case ISD::FMINNUM: return ISD::FMAXNUM; + case ISD::FMAXNUM_IEEE: + return ISD::FMINNUM_IEEE; + case ISD::FMINNUM_IEEE: + return ISD::FMAXNUM_IEEE; case AMDGPUISD::FMAX_LEGACY: return AMDGPUISD::FMIN_LEGACY; case AMDGPUISD::FMIN_LEGACY: @@ 
-3616,6 +3622,8 @@ } case ISD::FMAXNUM: case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: case AMDGPUISD::FMAX_LEGACY: case AMDGPUISD::FMIN_LEGACY: { // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -152,8 +152,14 @@ def smin_oneuse : HasOneUseBinOp; def umax_oneuse : HasOneUseBinOp; def umin_oneuse : HasOneUseBinOp; + def fminnum_oneuse : HasOneUseBinOp; def fmaxnum_oneuse : HasOneUseBinOp; + +def fminnum_ieee_oneuse : HasOneUseBinOp; +def fmaxnum_ieee_oneuse : HasOneUseBinOp; + + def and_oneuse : HasOneUseBinOp; def or_oneuse : HasOneUseBinOp; def xor_oneuse : HasOneUseBinOp; @@ -837,3 +843,25 @@ (AMDGPUrcp (fsqrt vt:$src)), (RsqInst $src) >; + +// Instructions which select to the same v_min_f* +def fminnum_like : PatFrags<(ops node:$src0, node:$src1), + [(fminnum_ieee node:$src0, node:$src1), + (fminnum node:$src0, node:$src1)] +>; + +// Instructions which select to the same v_max_f* +def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1), + [(fmaxnum_ieee node:$src0, node:$src1), + (fmaxnum node:$src0, node:$src1)] +>; + +def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1), + [(fminnum_ieee_oneuse node:$src0, node:$src1), + (fminnum_oneuse node:$src0, node:$src1)] +>; + +def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1), + [(fmaxnum_ieee_oneuse node:$src0, node:$src1), + (fmaxnum_oneuse node:$src0, node:$src1)] +>; Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -108,6 +108,7 @@ /// Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const; @@ -344,6 +345,11 @@ bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool denormalsEnabledForType(EVT VT) const; + + bool isKnownNeverNaNForTargetNode(SDValue Op, + const SelectionDAG &DAG, + bool SNaN = false, + unsigned Depth = 0) const override; }; } // End namespace llvm Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -384,8 +384,20 @@ if (Subtarget->hasBFE()) setHasExtractBitsInsn(true); - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Custom); + setOperationAction(ISD::FMAXNUM, MVT::f32, Custom); + setOperationAction(ISD::FMINNUM, MVT::f64, Custom); + setOperationAction(ISD::FMAXNUM, MVT::f64, Custom); + + + // These are really only legal for ieee_mode functions. We should be avoiding + // them for functions that don't have ieee_mode enabled, so just say they are + // legal. + setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); @@ -474,8 +486,7 @@ // F16 - VOP2 Actions. setOperationAction(ISD::BR_CC, MVT::f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); - setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::f16, Legal); + setOperationAction(ISD::FDIV, MVT::f16, Custom); // F16 - VOP3 Actions. 
@@ -558,6 +569,17 @@ // This isn't really legal, but this avoids the legalizer unrolling it (and // allows matching fneg (fabs x) patterns) setOperationAction(ISD::FABS, MVT::v2f16, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::f16, Custom); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal); + + setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom); + + setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand); } if (Subtarget->hasVOP3PInsts()) { @@ -575,8 +597,10 @@ setOperationAction(ISD::FADD, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); + + setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); @@ -596,6 +620,10 @@ setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + + setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom); @@ -634,6 +662,8 @@ setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::FMINNUM_IEEE); + setTargetDAGCombine(ISD::FMAXNUM_IEEE); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::SMAX); @@ -3570,6 +3600,9 @@ case ISD::FNEG: case ISD::FCANONICALIZE: return 
splitUnaryVectorOp(Op, DAG); + case ISD::FMINNUM: + case ISD::FMAXNUM: + return lowerFMINNUM_FMAXNUM(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -3580,10 +3613,10 @@ case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: - case ISD::FMINNUM: - case ISD::FMAXNUM: case ISD::FADD: case ISD::FMUL: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: return splitBinaryVectorOp(Op, DAG); } return SDValue(); @@ -4038,6 +4071,23 @@ return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } +SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + + // FIXME: Assert during selection that this is only selected for + // ieee_mode. Currently a combine can produce the ieee version for non-ieee + // mode functions, but this happens to be OK since it's only done in cases + // where there is known no sNaN. + if (IsIEEEMode) + return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); + + if (VT == MVT::v4f16) + return splitBinaryVectorOp(Op, DAG); + return Op; +} + SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); @@ -7433,37 +7483,32 @@ case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case AMDGPUISD::CLAMP: case AMDGPUISD::FMED3: case AMDGPUISD::FMAX3: case AMDGPUISD::FMIN3: { // FIXME: Shouldn't treat the generic operations different based these. - bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); - if (IsIEEEMode) { - // snans will be quieted, so we only need to worry about denormals. - if (Subtarget->supportsMinMaxDenormModes() || - denormalsEnabledForType(Op.getValueType())) - return true; - - // Flushing may be required. - // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such - // targets need to check their input recursively. 
- return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && - isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); - } + // However, we aren't really required to flush the result from + // minnum/maxnum.. + // snans will be quieted, so we only need to worry about denormals. if (Subtarget->supportsMinMaxDenormModes() || - denormalsEnabledForType(Op.getValueType())) { - // Only quieting may be necessary. - return DAG.isKnownNeverSNaN(Op.getOperand(0)) && - DAG.isKnownNeverSNaN(Op.getOperand(1)); + denormalsEnabledForType(Op.getValueType())) + return true; + + // Flushing may be required. + // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such + // targets need to check their input recursively. + + // FIXME: Does this apply with clamp? It's implemented with max. + for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { + if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1)) + return false; } - // Flushing and quieting may be necessary - // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it - // needs to be quieted. - return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && - isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); + return true; } case ISD::SELECT: { return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) && @@ -7490,6 +7535,21 @@ // Could be anything. return false; + case ISD::BITCAST: { + // Hack round the mess we make when legalizing extract_vector_elt + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::i16 && + Src.getOpcode() == ISD::TRUNCATE) { + SDValue TruncSrc = Src.getOperand(0); + if (TruncSrc.getValueType() == MVT::i32 && + TruncSrc.getOpcode() == ISD::BITCAST && + TruncSrc.getOperand(0).getValueType() == MVT::v2f16) { + return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1); + } + } + + return false; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); @@ -7515,7 +7575,6 @@ } // Constant fold canonicalize. 
- SDValue SITargetLowering::getCanonicalConstantFP( SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { // Flush denormals to 0 if not enabled. @@ -7611,18 +7670,40 @@ } } + unsigned SrcOpc = N0.getOpcode(); + + // If it's free to do so, push canonicalizes further up the source, which may + // find a canonical source. + // + // TODO: More opcodes. Note this is unsafe for the the _ieee minnum/maxnum for + // sNaNs. + if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) { + auto *CRHS = dyn_cast(N0.getOperand(1)); + if (CRHS && N0.hasOneUse()) { + SDLoc SL(N); + SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT, + N0.getOperand(0)); + SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF()); + DCI.AddToWorklist(Canon0.getNode()); + + return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1); + } + } + return isCanonicalized(DAG, N0) ? N0 : SDValue(); } static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: return AMDGPUISD::FMAX3; case ISD::SMAX: return AMDGPUISD::SMAX3; case ISD::UMAX: return AMDGPUISD::UMAX3; case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: return AMDGPUISD::FMIN3; case ISD::SMIN: return AMDGPUISD::SMIN3; @@ -7782,6 +7863,7 @@ // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || + (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) || (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || @@ -7900,7 +7982,9 @@ case ISD::SMIN: case ISD::SMAX: case ISD::FMAXNUM: - case ISD::FMINNUM: { + case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: { SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec.getOperand(0), Idx); SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, @@ -8500,13 +8584,15 @@ return performSetCCCombine(N, DCI); case ISD::FMAXNUM: case 
ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: { - if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && - getTargetMachine().getOptLevel() > CodeGenOpt::None) + if (getTargetMachine().getOptLevel() > CodeGenOpt::None) return performMinMaxCombine(N, DCI); break; } @@ -9225,3 +9311,17 @@ return false; } } + +bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, + const SelectionDAG &DAG, + bool SNaN, + unsigned Depth) const { + if (Op.getOpcode() == AMDGPUISD::CLAMP) { + if (Subtarget->enableDX10Clamp()) + return true; // Clamped to 0. + return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); + } + + return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, + SNaN, Depth); +} Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -1611,10 +1611,11 @@ // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) class FPMed3Pat : GCNPat< - (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) @@ -1622,10 +1623,10 @@ class FP16Med3Pat : GCNPat< - (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fmaxnum_like 
(fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) >; Index: lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ lib/Target/AMDGPU/VOP2Instructions.td @@ -367,8 +367,8 @@ defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>; defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>; defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>; -defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>; -defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>; +defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>; +defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>; defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>; defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>; defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>; @@ -476,8 +476,8 @@ defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; -defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>; -defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>; +defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; +defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; Index: 
lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -294,8 +294,8 @@ def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile, fmul, 1>; -def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile, fminnum, 1>; -def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum, 1>; +def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile, fminnum_like, 1>; +def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_like, 1>; } // End SchedRW = [WriteDoubleAdd] let SchedRW = [WriteQuarterRate32] in { Index: lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- lib/Target/AMDGPU/VOP3PInstructions.td +++ lib/Target/AMDGPU/VOP3PInstructions.td @@ -48,8 +48,8 @@ def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, fmul>; -def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum>; -def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile, fminnum>; +def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum_like>; +def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile, fminnum_like>; def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile, add>; def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>; Index: test/CodeGen/AMDGPU/clamp.ll =================================================================== --- test/CodeGen/AMDGPU/clamp.ll +++ test/CodeGen/AMDGPU/clamp.ll @@ -73,9 +73,10 @@ ; matched through med3, not if directly. Is this correct? 
; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[A]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] +; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}} +; GCN: v_med3_f32 v{{[0-9]+}}, [[QUIET_A]], [[SIGNBIT]], 1.0 define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -90,8 +91,17 @@ ; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] +; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]] +; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[QUIET_A]] +; GCN-NOT: [[MAX]] +; GCN-NOT: [[MED]] + +; SI: buffer_store_dword [[MED]] +; SI: buffer_store_dword [[MAX]] + +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]] +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]] define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid @@ -406,8 +416,8 @@ ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] +; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { %tid = call i32 
@llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -451,14 +451,13 @@ } ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode: -; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} -; GFX9-NOT: v_max -; GFX9-NOT: v_mul - -; VI-DENORM-NOT: v_max_f32 -; VI-DENORM-NOT: v_mul_f32 +; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] +; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]] +; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] +; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]] -; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GCN-NOT: v_max +; GCN-NOT: v_mul ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) { @@ -472,15 +471,13 @@ } ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode: -; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} - -; GFX9-NOT: v_max -; GFX9-NOT: v_mul - - -; VI-DENORM-NOT: v_max -; VI-DENORM-NOT: v_mul ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GCN-DENORM-NOT: v_max +; GCN-DENORM-NOT: v_mul + +; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN-DENORM-NOT: v_max +; GCN-DENORM-NOT: v_mul ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}] define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 { @@ -526,13 +523,19 @@ } ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: -; GFX9: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] + +; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] +; GFX9-DENORM: v_min_f32_e32 
[[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]] -; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} -; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]] +; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]] +; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]] -; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]] +; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]] + +; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]] ; GCN-NOT: v_mul ; GCN-NOT: v_max @@ -548,11 +551,14 @@ } ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode: -; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}} -; VI-FLUSH: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} -; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]] +; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] + +; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]] + +; VI-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]] +; VI-FLUSH: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]] -; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}} +; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]] ; GCN-NOT: v_mul ; GCN-NOT: v_max @@ -703,16 +709,21 @@ ; Need to quiet the nan with a separate instruction since it will be ; passed through the minnum. 
+; FIXME: canonicalize doesn't work correctly without ieee_mode ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode: +; GFX9-NOT: v0 +; GFX9-NOT: v1 ; GFX9: v_min_f32_e32 v0, v0, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-NEXT: ; return to shader -; VI: v_min_f32_e32 v0, v0, v1 -; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0 -; VI-DENORM: v_max_f32_e32 v0, v0, v0 +; VI-FLUSH: v_min_f32_e32 v0, v0, v1 +; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-FLUSH-NEXT: ; return + +; VI-DENORM-NOT: v0 +; VI-DENORM: v_min_f32_e32 v0, v0, v1 +; VI-DENORM-NEXT: ; return define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) { %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) %canonicalized = tail call float @llvm.canonicalize.f32(float %v) @@ -723,8 +734,14 @@ ; GFX9: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 -; VI: v_min_f32_e32 v0, v0, v1 -; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0 +; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1 +; VI-FLUSH: v_min_f32_e32 v0, v0, v1 + +; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0 +; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1 +; VI-DENORM: v_min_f32_e32 v0, v0, v1 + ; VI-NEXT: s_setpc_b64 define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) { %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) Index: test/CodeGen/AMDGPU/fmax3.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fmax3.f64.ll +++ test/CodeGen/AMDGPU/fmax3.f64.ll @@ -4,11 +4,14 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone ; SI-LABEL: {{^}}test_fmax3_f64: -; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} -; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 -; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]] +; SI:
buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} +; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 ; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 -; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] +; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]] +; SI: v_max_f64 [[QUIET_B:v\[[0-9]+:[0-9]+\]]], [[REGB]], [[REGB]] +; SI: v_max_f64 [[MAX0:v\[[0-9]+:[0-9]+\]]], [[QUIET_A]], [[QUIET_B]] +; SI: v_max_f64 [[QUIET_C:v\[[0-9]+:[0-9]+\]]], [[REGC]], [[REGC]] +; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[MAX0]], [[QUIET_C]] ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { Index: test/CodeGen/AMDGPU/fmax3.ll =================================================================== --- test/CodeGen/AMDGPU/fmax3.ll +++ test/CodeGen/AMDGPU/fmax3.ll @@ -48,8 +48,11 @@ ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] -; VI: v_max_f16_e32 -; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], +; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]] ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] ; GCN: buffer_store_short [[RESULT]], @@ -75,8 +78,11 @@ ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] -; VI: v_max_f16_e32 -; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], +; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; 
VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]] ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] ; GCN: buffer_store_short [[RESULT]], @@ -100,22 +106,25 @@ ; SI-NEXT: v_max3_f32 ; SI-NEXT: v_max3_f32 -; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_max_f16_e32 v0, v0, v1 -; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI: v_max_f16_e32 v0, v2, v0 -; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_f16_e32 v0, v0, v3 -; VI: v_or_b32_e32 v0, v0, v1 - -; GFX9: v_pk_max_f16 +; VI: s_waitcnt +; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_max_f16_e32 v0, v2, v0 +; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v3 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 + +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_max_f16 ; GFX9-NEXT: v_pk_max_f16 ; GFX9-NEXT: v_pk_max_f16 -define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) { +define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 { entry: - %max = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) - %max1 = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max) - %res = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d) + %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> 
%a, <2 x half> %b) + %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max) + %res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d) ret <2 x half> %res } @@ -126,3 +135,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable } +attributes #2 = { nounwind "no-nans-fp-math"="true" } Index: test/CodeGen/AMDGPU/fmax_legacy.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -97,7 +97,7 @@ ; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NNAN-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: @@ -178,7 +178,7 @@ ; VI-NNAN-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 ; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: @@ -283,8 +283,8 @@ ; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 ; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NNAN-NEXT: s_setpc_b64 
s[30:31] ; ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: @@ -437,10 +437,10 @@ ; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5 ; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 -; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: Index: test/CodeGen/AMDGPU/fmax_legacy.ll =================================================================== --- test/CodeGen/AMDGPU/fmax_legacy.ll +++ test/CodeGen/AMDGPU/fmax_legacy.ll @@ -1,13 +1,22 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s + +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s +; RUN: llc 
-enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s + ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #1 ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX @@ -26,12 +35,16 @@ } ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]] ; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]] -; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] + +; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]] + ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] ; EG: MAX @@ -52,9 +65,14 @@ } ; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: 
buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { @@ -72,9 +90,15 @@ } ; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + + ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { @@ -92,9 +116,14 @@ } ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + ; GCN-NONAN: 
v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { @@ -112,9 +141,15 @@ } ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + + ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { @@ -132,12 +167,24 @@ } ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32: -; GCN-SAFE: v_max_legacy_f32_e32 -; GCN-SAFE: v_max_legacy_f32_e32 -; GCN-SAFE: v_max_legacy_f32_e32 +; SI-SAFE: v_max_legacy_f32_e32 +; SI-SAFE: v_max_legacy_f32_e32 +; SI-SAFE: v_max_legacy_f32_e32 + +; VI-SAFE: v_cmp_gt_f32_e32 +; VI-SAFE: v_cndmask_b32_e32 +; VI-SAFE: v_cmp_gt_f32_e32 +; VI-SAFE: v_cndmask_b32_e32 +; VI-SAFE: v_cmp_gt_f32_e32 +; VI-SAFE: v_cndmask_b32_e32 +; VI-SAFE-NOT: v_cmp +; VI-SAFE-NOT: v_cndmask + ; GCN-NONAN: v_max_f32_e32 ; GCN-NONAN: v_max_f32_e32 ; GCN-NONAN: v_max_f32_e32 + +; GCN-NOT: v_max define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid @@ -153,8 +200,8 @@ } ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: 
buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-NOT: v_max_ ; GCN: v_cmp_gt_f32 ; GCN-NEXT: v_cndmask_b32 Index: test/CodeGen/AMDGPU/fmaxnum.ll =================================================================== --- test/CodeGen/AMDGPU/fmaxnum.ll +++ test/CodeGen/AMDGPU/fmaxnum.ll @@ -1,14 +1,26 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; GCN-LABEL: {{^}}test_fmax_f32: -; GCN: v_max_f32_e32 -define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) #0 { - %val = call float @llvm.maxnum.f32(float %a, float %b) +; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_on: +; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}} +; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}} +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]] +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @test_fmax_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 { + %val = call float @llvm.maxnum.f32(float %a, float %b) #1 store float %val, float addrspace(1)* %out, align 4 ret void } +; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_off: +; GCN: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return +define amdgpu_ps float @test_fmax_f32_ieee_mode_off(float %a, float %b) #0 { + %val = call float @llvm.maxnum.f32(float %a, float %b) #1 + ret float %val +} + ; GCN-LABEL: {{^}}test_fmax_v2f32: ; GCN: v_max_f32_e32 ; GCN: v_max_f32_e32 @@ -158,38 +170,34 @@ ret void } -; GCN-LABEL: {{^}}fmax_var_immediate_f32: +; GCN-LABEL: {{^}}fmax_var_immediate_f32_no_ieee: ; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 -define amdgpu_kernel void @fmax_var_immediate_f32(float 
addrspace(1)* %out, float %a) #0 { - %val = call float @llvm.maxnum.f32(float %a, float 2.0) - store float %val, float addrspace(1)* %out, align 4 - ret void +define amdgpu_ps float @fmax_var_immediate_f32_no_ieee(float inreg %a) #0 { + %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 + ret float %val } -; GCN-LABEL: {{^}}fmax_immediate_var_f32: +; GCN-LABEL: {{^}}fmax_immediate_var_f32_no_ieee: ; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 -define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) #0 { - %val = call float @llvm.maxnum.f32(float 2.0, float %a) - store float %val, float addrspace(1)* %out, align 4 - ret void +define amdgpu_ps float @fmax_immediate_var_f32_no_ieee(float inreg %a) #0 { + %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 + ret float %val } -; GCN-LABEL: {{^}}fmax_var_literal_f32: +; GCN-LABEL: {{^}}fmax_var_literal_f32_no_ieee: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] -define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) #0 { - %val = call float @llvm.maxnum.f32(float %a, float 99.0) - store float %val, float addrspace(1)* %out, align 4 - ret void +define amdgpu_ps float @fmax_var_literal_f32_no_ieee(float inreg %a) #0 { + %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 + ret float %val } -; GCN-LABEL: {{^}}fmax_literal_var_f32: +; GCN-LABEL: {{^}}fmax_literal_var_f32_no_ieee: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] -define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) #0 { - %val = call float @llvm.maxnum.f32(float 99.0, float %a) - store float %val, float addrspace(1)* %out, align 4 - ret void +define amdgpu_ps float @fmax_literal_var_f32_no_ieee(float inreg %a) #0 { + %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 + ret float %val } ; GCN-LABEL: {{^}}test_func_fmax_v3f32: 
Index: test/CodeGen/AMDGPU/fmin3.ll =================================================================== --- test/CodeGen/AMDGPU/fmin3.ll +++ test/CodeGen/AMDGPU/fmin3.ll @@ -95,22 +95,26 @@ ; SI-NEXT: v_min3_f32 ; SI-NEXT: v_min3_f32 -; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_min_f16_e32 v0, v0, v1 -; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI: v_min_f16_e32 v0, v2, v0 -; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_f16_e32 v0, v0, v3 -; VI: v_or_b32_e32 v0, v0, v1 - -; GFX9: v_pk_min_f16 -; GFX9: v_pk_min_f16 -; GFX9: v_pk_min_f16 -define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) { +; VI: s_waitcnt +; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_min_f16_e32 v0, v2, v0 +; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v3 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 + +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: v_pk_min_f16 v0, v2, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX9-NEXT: s_setpc_b64 +define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 { entry: - %min = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) - %min1 = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min) - %res = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d) + %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> 
%b) + %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min) + %res = call <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d) ret <2 x half> %res } @@ -121,3 +125,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable } +attributes #2 = { nounwind "no-nans-fp-math"="true" } Index: test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll =================================================================== --- test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll +++ test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll @@ -1,9 +1,19 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,SI %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN,SI %s + +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,VI %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN,VI %s ; GCN-LABEL: {{^}}min_fneg_select_regression_0: -; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 -; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0 +; GCN-NOT: v_mul + +; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 + +; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 + +; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0 define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a 
%cmp.a = fcmp ult float %a, 1.0 @@ -12,7 +22,14 @@ } ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0: -; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 +; GCN-NOT: v_mul + +; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 + +; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc +; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 + ; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a @@ -22,9 +39,16 @@ } ; GCN-LABEL: {{^}}max_fneg_select_regression_0: -; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 +; GCN-NOT: v_mul + +; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 + +; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 + ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 -define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 { +define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ugt float %a, 1.0 %min.a = select i1 %cmp.a, float %fneg.a, float -1.0 @@ -32,9 +56,16 @@ } ; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0: -; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 +; GCN-NOT: v_mul + +; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 + +; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc +; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 + ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 -define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 { +define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ugt float %a, -1.0 %min.a = select i1 %cmp.a, float %fneg.a, float 1.0 Index: test/CodeGen/AMDGPU/fmin_legacy.f16.ll 
=================================================================== --- test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -98,7 +98,7 @@ ; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NNAN-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: @@ -179,7 +179,7 @@ ; VI-NNAN-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 ; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: @@ -284,8 +284,8 @@ ; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 ; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: @@ -438,10 +438,10 @@ ; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5 ; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 
-; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NNAN-NEXT: s_setpc_b64 s[30:31] ; ; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: Index: test/CodeGen/AMDGPU/fmin_legacy.ll =================================================================== --- test/CodeGen/AMDGPU/fmin_legacy.ll +++ test/CodeGen/AMDGPU/fmin_legacy.ll @@ -1,5 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s + +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s + ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #1 @@ -10,8 +14,13 @@ ; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32: ; 
EG: MIN * -; GCN-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GCN-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} + +; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} + +; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} + +; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(1)* %out, <4 x float> %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 @@ -22,13 +31,17 @@ } ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] -; GCN-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]] +; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]] + +; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]] +; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]] +; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]] define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b %val = select i1 %cmp, float %a, float %b @@ -36,13 +49,19 @@ ret void } +; Nsz also needed +; FIXME: Should separate tests ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src: -; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[A]], 1.0 ; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[B]], 2.0 -; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, 
[[ADD_B]], [[ADD_A]] +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] + +; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] +; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc + ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1)* %out, float %a, float %b) #0 { %a.nnan = fadd nnan float %a, 1.0 @@ -54,9 +73,14 @@ } ; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 @@ -73,9 +97,14 @@ } ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI-SAFE: v_cmp_le_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1 @@ -92,9 +121,14 @@ } ; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 @@ -111,9 +145,14 @@ } ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 @@ -130,9 +169,14 @@ } ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: -; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]],
[[A]] +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] +; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 @@ -149,10 +193,15 @@ } ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32: -; GCN: buffer_load_dwordx2 -; GCN: buffer_load_dwordx2 -; GCN-SAFE: v_min_legacy_f32_e32 -; GCN-SAFE: v_min_legacy_f32_e32 +; GCN: {{buffer|flat}}_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 + +; VI-SAFE: v_cmp_nge_f32_e32 +; VI-SAFE: v_cndmask_b32_e32 +; VI-SAFE: v_cmp_nge_f32_e32 +; VI-SAFE: v_cndmask_b32_e32 ; GCN-NONAN: v_min_f32_e32 ; GCN-NONAN: v_min_f32_e32 @@ -171,13 +220,24 @@ } ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32: -; GCN-SAFE: v_min_legacy_f32_e32 -; GCN-SAFE: v_min_legacy_f32_e32 -; GCN-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE: v_min_legacy_f32_e32 +; SI-SAFE-NOT: v_min_ + +; VI-SAFE: v_cmp_nge_f32_e32 +; VI-SAFE: v_cndmask_b32_e32 +; VI-SAFE: v_cmp_nge_f32_e32 +; VI-SAFE: v_cndmask_b32_e32 +; VI-SAFE: v_cmp_nge_f32_e32 +; VI-SAFE: v_cndmask_b32_e32 +; VI-NOT: v_cmp +; VI-NOT: v_cndmask ; GCN-NONAN: v_min_f32_e32 ; GCN-NONAN: v_min_f32_e32 ; GCN-NONAN: v_min_f32_e32 +; GCN-NONAN-NOT: v_min_ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid @@ -193,8 +253,8 @@ } ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use: -; GCN: buffer_load_dword [[A:v[0-9]+]],
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-NOT: v_min ; GCN: v_cmp_le_f32 ; GCN-NEXT: v_cndmask_b32 Index: test/CodeGen/AMDGPU/fminnum.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fminnum.f64.ll +++ test/CodeGen/AMDGPU/fminnum.f64.ll @@ -7,15 +7,35 @@ declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0 declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 -; FUNC-LABEL: @test_fmin_f64 -; SI: v_min_f64 -define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { +; FUNC-LABEL: {{^}}test_fmin_f64_ieee: +; SI: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]] +; SI: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]] +; SI-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]] +; SI-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]] +; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]] +define amdgpu_kernel void @test_fmin_f64_ieee([8 x i32], double %a, [8 x i32], double %b) nounwind { + %val = call double @llvm.minnum.f64(double %a, double %b) #0 + store double %val, double addrspace(1)* undef, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_f64_no_ieee: +; SI: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]] +; SI: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]] +; SI-NOT: [[VAL0]] +; SI-NOT: [[VAL1]] +; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]] +; SI-NOT: [[RESULT]] +; SI: ds_write_b64 v{{[0-9]+}}, [[RESULT]] +define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind { + %a = load volatile double, double addrspace(3)* undef + %b = load volatile double, double addrspace(3)* undef %val = call double @llvm.minnum.f64(double %a, double %b) #0 - store double %val, double addrspace(1)* %out, align 
8 + store volatile double %val, double addrspace(3)* undef ret void } -; FUNC-LABEL: @test_fmin_v2f64 +; FUNC-LABEL: {{^}}test_fmin_v2f64: ; SI: v_min_f64 ; SI: v_min_f64 define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { @@ -24,7 +44,7 @@ ret void } -; FUNC-LABEL: @test_fmin_v4f64 +; FUNC-LABEL: {{^}}test_fmin_v4f64: ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 @@ -35,7 +55,7 @@ ret void } -; FUNC-LABEL: @test_fmin_v8f64 +; FUNC-LABEL: {{^}}test_fmin_v8f64: ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 @@ -50,7 +70,7 @@ ret void } -; FUNC-LABEL: @test_fmin_v16f64 +; FUNC-LABEL: {{^}}test_fmin_v16f64: ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 Index: test/CodeGen/AMDGPU/fminnum.ll =================================================================== --- test/CodeGen/AMDGPU/fminnum.ll +++ test/CodeGen/AMDGPU/fminnum.ll @@ -1,14 +1,45 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; GCN-LABEL: {{^}}test_fmin_f32: -; GCN: v_min_f32_e32 -define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) #0 { - %val = call float @llvm.minnum.f32(float %a, float %b) +; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_on: +; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}} +; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}} +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]] +; GCN-NOT: [[RESULT]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @test_fmin_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 { + %val = call float @llvm.minnum.f32(float %a, float %b) #1 store float %val, float addrspace(1)* %out, align 4 ret void } +; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_on: +; GCN: s_waitcnt +; GCN-NEXT: v_min_f32_e32 v0, v0, 
v1 +; GCN-NEXT: s_setpc_b64 +define float @test_fmin_nnan_f32_ieee_mode_on(float %a, float %b) #0 { + %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1 + ret float %val +} + +; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_off: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return +define amdgpu_ps float @test_fmin_nnan_f32_ieee_mode_off(float %a, float %b) #0 { + %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1 + ret float %val +} + +; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_off: +; GCN: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return +define amdgpu_ps float @test_fmin_f32_ieee_mode_off(float %a, float %b) #0 { + %val = call float @llvm.minnum.f32(float %a, float %b) #1 + ret float %val +} + ; GCN-LABEL: {{^}}test_fmin_v2f32: ; GCN: v_min_f32_e32 ; GCN: v_min_f32_e32 @@ -147,38 +178,34 @@ ret void } -; GCN-LABEL: {{^}}fmin_var_immediate_f32: -; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 -define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) #0 { - %val = call float @llvm.minnum.f32(float %a, float 2.0) - store float %val, float addrspace(1)* %out, align 4 - ret void +; GCN-LABEL: {{^}}fmin_var_immediate_f32_no_ieee: +; GCN: v_min_f32_e32 v0, 2.0, v0 +define amdgpu_ps float @fmin_var_immediate_f32_no_ieee(float %a) #0 { + %val = call float @llvm.minnum.f32(float %a, float 2.0) #1 + ret float %val } -; GCN-LABEL: {{^}}fmin_immediate_var_f32: +; GCN-LABEL: {{^}}fmin_immediate_var_f32_no_ieee: ; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 -define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) #0 { - %val = call float @llvm.minnum.f32(float 2.0, float %a) - store float %val, float addrspace(1)* %out, align 4 - ret void +define amdgpu_ps float @fmin_immediate_var_f32_no_ieee(float inreg %a) #0 { + %val = call float @llvm.minnum.f32(float 2.0, float %a) #1 + ret float %val } -; GCN-LABEL: {{^}}fmin_var_literal_f32: +; GCN-LABEL: 
{{^}}fmin_var_literal_f32_no_ieee: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 ; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] -define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) #0 { - %val = call float @llvm.minnum.f32(float %a, float 99.0) - store float %val, float addrspace(1)* %out, align 4 - ret void +define amdgpu_ps float @fmin_var_literal_f32_no_ieee(float inreg %a) #0 { + %val = call float @llvm.minnum.f32(float %a, float 99.0) #1 + ret float %val } -; GCN-LABEL: {{^}}fmin_literal_var_f32: +; GCN-LABEL: {{^}}fmin_literal_var_f32_no_ieee: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 ; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] -define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) #0 { - %val = call float @llvm.minnum.f32(float 99.0, float %a) - store float %val, float addrspace(1)* %out, align 4 - ret void +define amdgpu_ps float @fmin_literal_var_f32_no_ieee(float inreg %a) #0 { + %val = call float @llvm.minnum.f32(float 99.0, float %a) #1 + ret float %val } ; GCN-LABEL: {{^}}test_func_fmin_v3f32: Index: test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-combines.ll +++ test/CodeGen/AMDGPU/fneg-combines.ll @@ -396,12 +396,14 @@ ; fminnum tests ; -------------------------------------------------------------------------------- -; GCN-LABEL: {{^}}v_fneg_minnum_f32: +; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float 
addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -415,11 +417,23 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_self_minnum_f32: +; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_max_f32_e64 v0, -v0, -v1 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 { + %min = call float @llvm.minnum.f32(float %a, float %b) + %fneg = fsub float -0.000000e+00, %min + ret float %fneg +} + +; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -431,11 +445,22 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32: +; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN: v_max_f32_e64 v0, -v0, -v0 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 { + %min = call float @llvm.minnum.f32(float %a, float %a) + %min.fneg = fsub float -0.0, %min + ret float %min.fneg +} + +; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 
[[RESULT:v[0-9]+]], -[[A]], -4.0 +; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -447,11 +472,22 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32: +; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN: v_max_f32_e64 v0, -v0, -4.0 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 { + %min = call float @llvm.minnum.f32(float 4.0, float %a) + %fneg = fsub float -0.000000e+00, %min + ret float %fneg +} + +; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0 +; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -463,6 +499,16 @@ ret void } +; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN: v_max_f32_e64 v0, -v0, 4.0 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 { + %min = call float @llvm.minnum.f32(float -4.0, float %a) + %fneg = 
fsub float -0.000000e+00, %min + ret float %fneg +} + ; GCN-LABEL: {{^}}v_fneg_0_minnum_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]] @@ -479,11 +525,12 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32: +; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0 +; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -498,10 +545,11 @@ ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32: ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 -; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]] +; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]] +; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]] -; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]] +; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]] +; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]] ; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -520,10 +568,11 @@ ; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32: ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 -; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]] +; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]] +; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]] -; VI: v_max_f32_e64 
[[RESULT:v[0-9]+]], -[[A]], 0.15915494 +; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]] +; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { @@ -545,7 +594,8 @@ ; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]] -; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]] +; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]] +; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]] ; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]] ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -568,7 +618,8 @@ ; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]] -; VI: v_max_f16_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494 +; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]] +; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]] ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 { @@ -588,7 +639,8 @@ ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882 -; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]] +; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; VI: v_min_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[A]], 0.15915494 ; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]] @@ -611,9 +663,11 @@ ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882 -; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], 
s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]] +; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], 0.15915494 +; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]] +; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { @@ -638,13 +692,14 @@ ret float %fneg } -; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32: +; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]] +; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -660,15 +715,16 @@ } ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32: -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]] -; SI: v_max_f32_e64 [[MIN:v[0-9]+]], -[[A]], [[K]] +; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]] ; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], 
[[MIN]], [[B]] -; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[A]] +; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]] +; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]] ; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -687,14 +743,29 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32: +; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0 +; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 { + %min = call float @llvm.minnum.f32(float 0.0, float %a) + %fneg = fsub float -0.000000e+00, %min + %mul = fmul float %fneg, %b + ret float %mul +} + +; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] +; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] -define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -710,16 +781,34 @@ ret void } +; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee: +; 
GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_max_f32_e64 v0, -v0, -v1 +; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; GCN-NEXT: ; return +define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 { + %min = call float @llvm.minnum.f32(float %a, float %b) + %fneg = fsub float -0.000000e+00, %min + %use1 = fmul float %min, 4.0 + %ins0 = insertelement <2 x float> undef, float %fneg, i32 0 + %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1 + ret <2 x float> %ins1 +} + ; -------------------------------------------------------------------------------- ; fmaxnum tests ; -------------------------------------------------------------------------------- -; GCN-LABEL: {{^}}v_fneg_maxnum_f32: + +; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -727,60 +816,104 @@ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %min = call float @llvm.maxnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %min + %max = call float @llvm.maxnum.f32(float %a, float %b) + %fneg = fsub float 
-0.000000e+00, %max store float %fneg, float addrspace(1)* %out.gep ret void } -; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32: +; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_min_f32_e64 v0, -v0, -v1 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 { + %max = call float @llvm.maxnum.f32(float %a, float %b) + %fneg = fsub float -0.000000e+00, %max + ret float %fneg +} + +; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep - %min = call float @llvm.maxnum.f32(float %a, float %a) - %min.fneg = fsub float -0.0, %min - store float %min.fneg, float addrspace(1)* %out.gep + %max = call float @llvm.maxnum.f32(float %a, float %a) + %max.fneg = fsub float -0.0, %max + store float %max.fneg, float addrspace(1)* %out.gep ret void } -; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32: +; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN: v_min_f32_e64 v0, -v0, -v0 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 { + %max = call float @llvm.maxnum.f32(float %a, float %a) + %max.fneg = fsub float -0.0, %max + ret float %max.fneg +} + +; GCN-LABEL: 
{{^}}v_fneg_posk_maxnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0 +; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep - %min = call float @llvm.maxnum.f32(float 4.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %max = call float @llvm.maxnum.f32(float 4.0, float %a) + %fneg = fsub float -0.000000e+00, %max store float %fneg, float addrspace(1)* %out.gep ret void } -; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32: +; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN: v_min_f32_e64 v0, -v0, -4.0 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 { + %max = call float @llvm.maxnum.f32(float 4.0, float %a) + %fneg = fsub float -0.000000e+00, %max + ret float %fneg +} + +; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0 +; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* 
%a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep - %min = call float @llvm.maxnum.f32(float -4.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %max = call float @llvm.maxnum.f32(float -4.0, float %a) + %fneg = fsub float -0.000000e+00, %max store float %fneg, float addrspace(1)* %out.gep ret void } +; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN: v_min_f32_e64 v0, -v0, 4.0 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 { + %max = call float @llvm.maxnum.f32(float -4.0, float %a) + %fneg = fsub float -0.000000e+00, %max + ret float %fneg +} + ; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]] @@ -797,11 +930,12 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32: +; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0 +; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -813,13 +947,24 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32: +; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN: v_min_f32_e64 v0, -v0, 0{{$}} +; 
GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 { + %max = call float @llvm.maxnum.f32(float -0.0, float %a) + %fneg = fsub float -0.000000e+00, %max + ret float %fneg +} + +; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] +; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -834,14 +979,29 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32: +; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0 +; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1 +; GCN-NEXT: ; return +define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 { + %max = call float @llvm.maxnum.f32(float 0.0, float %a) + %fneg = fsub float -0.000000e+00, %max + %mul = fmul float %fneg, %b + ret float %mul +} + +; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] +; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] +; GCN: v_min_f32_e32 
[[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]] ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] -define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -849,14 +1009,29 @@ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %min = call float @llvm.maxnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %min - %use1 = fmul float %min, 4.0 + %max = call float @llvm.maxnum.f32(float %a, float %b) + %fneg = fsub float -0.000000e+00, %max + %use1 = fmul float %max, 4.0 store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out ret void } +; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_min_f32_e64 v0, -v0, -v1 +; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; GCN-NEXT: ; return +define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 { + %max = call float @llvm.maxnum.f32(float %a, float %b) + %fneg = fsub float -0.000000e+00, %max + %use1 = fmul float %max, 4.0 + %ins0 = insertelement <2 x float> undef, float %fneg, i32 0 + %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1 + ret <2 x float> %ins1 +} + ; -------------------------------------------------------------------------------- ; fma tests ; 
-------------------------------------------------------------------------------- Index: test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- test/CodeGen/AMDGPU/known-never-snan.ll +++ test/CodeGen/AMDGPU/known-never-snan.ll @@ -99,8 +99,7 @@ ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 -; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a %b.nnan.add = fadd nnan float %b, 1.0 @@ -110,14 +109,46 @@ ret float %med } +define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %b.nsnan = fadd float %b, 1.0 + %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nsnan) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + +define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float %a, float %b) #0 { +; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %a.nsnan = fadd float %a, 1.0 + %known.not.snan = call float @llvm.minnum.f32(float %a.nsnan, float %b) + %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0) + %med = 
call float @llvm.minnum.f32(float %max, float 4.0) + ret float %med +} + define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 { ; GCN-LABEL: v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 -; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %b.nnan.add = fadd nnan float %b, 1.0 %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nnan.add) @@ -131,9 +162,9 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 -; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b) @@ -148,8 +179,8 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a %b.nnan.add = fadd nnan float %b, 1.0 @@ -164,8 +195,9 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %b.nnan.add = fadd nnan float %b, 1.0 %known.not.snan = call float @llvm.maxnum.f32(float %a, float 
%b.nnan.add) @@ -179,8 +211,9 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b) @@ -215,8 +248,8 @@ ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %b.nnan.add = fadd nnan float %b, 1.0 %cmp = icmp eq i32 %c, 0 @@ -233,8 +266,8 @@ ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a %cmp = icmp eq i32 %c, 0 @@ -494,6 +527,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c) @@ -507,8 +541,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_min3_f32 v0, v0, v1, v2 -; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min0 = call float @llvm.minnum.f32(float %a, float %b) %known.not.snan = call float @llvm.minnum.f32(float %min0, float %c) Index: 
test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -1,23 +1,91 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b) declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) -; GCN-LABEL: {{^}}maxnum_f16: -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel 
void @maxnum_f16( +; SI-LABEL: maxnum_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: maxnum_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: maxnum_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 
s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -29,15 +97,65 @@ ret void } -; GCN-LABEL: {{^}}maxnum_f16_imm_a: -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @maxnum_f16_imm_a( +; SI-LABEL: maxnum_f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: maxnum_f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 
+; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: maxnum_f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -47,15 +165,65 @@ ret void } -; GCN-LABEL: {{^}}maxnum_f16_imm_b: -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @maxnum_f16_imm_b( +; SI-LABEL: maxnum_f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 
+; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: maxnum_f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: maxnum_f16_imm_b: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -65,34 +233,79 @@ ret void } -; GCN-LABEL: {{^}}maxnum_v2f16: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: 
v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] - -; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16( +; SI-LABEL: maxnum_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s1, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: 
v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: maxnum_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: v_max_f16_e64 v0, s5, s5 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: v_max_f16_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: maxnum_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -104,29 +317,64 @@ ret void } -; GCN-LABEL: {{^}}maxnum_v2f16_imm_a: -; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 
v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 -; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200 -; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] - -; GCN: buffer_store_dword v[[R_V2_F16]] define amdgpu_kernel void @maxnum_v2f16_imm_a( +; SI-LABEL: maxnum_v2f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: maxnum_v2f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; 
VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: maxnum_v2f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x44004200 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -136,31 +384,64 @@ ret void } -; GCN-LABEL: {{^}}maxnum_v2f16_imm_b: -; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - -; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 -; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, 
v[[A_V2_F16]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] - - -; SIVI-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 -; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] - -; GCN: buffer_store_dword v[[R_V2_F16]] define amdgpu_kernel void @maxnum_v2f16_imm_b( +; SI-LABEL: maxnum_v2f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: maxnum_v2f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4200 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 +; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: maxnum_v2f16_imm_b: +; GFX9: ; %bb.0: ; 
%entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x42004400 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -171,10 +452,94 @@ } ; FIXME: Scalarize with undef half -; GCN-LABEL: {{^}}maxnum_v3f16: -; GFX9: v_pk_max_f16 -; GFX9: v_pk_max_f16 define amdgpu_kernel void @maxnum_v3f16( +; SI-LABEL: maxnum_v3f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s1, s6, 16 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_max_f32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
buffer_store_short v0, off, s[0:3], 0 offset:4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: maxnum_v3f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: v_max_f16_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e32 v1, v2, v1 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: maxnum_v3f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <3 x half> addrspace(1)* %r, <3 x half> addrspace(1)* %a, <3 x half> addrspace(1)* %b) { @@ -186,13 +551,107 @@ ret void } -; GCN-LABEL: {{^}}maxnum_v4f16: -; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} -; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} -; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] -; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] -; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}} define amdgpu_kernel void @maxnum_v4f16( +; SI-LABEL: maxnum_v4f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_max_f32_e32 v3, v3, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_max_f32_e32 v1, v1, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, v2, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v0, v0, 
v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: maxnum_v4f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: v_max_f16_e64 v0, s7, s7 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: v_max_f16_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: v_max_f16_e32 v0, v2, v0 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e64 v3, s4, s4 +; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: maxnum_v4f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a, <4 x half> addrspace(1)* %b) { @@ -204,28 +663,87 @@ ret void } -; GCN-LABEL: {{^}}fmax_v4f16_imm_a: -; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} -; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200 -; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800 - -; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], [[K0]] -; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], [[K1]] -; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}} - -; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000 -; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400 - -; VI-DAG: v_max_f16_sdwa v[[MAX_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_max_f16_e32 v[[MAX_HI_LO:[0-9]+]], 0x4200, v[[A_HI]] -; VI-DAG: v_max_f16_sdwa v[[MAX_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_max_f16_e32 v[[MAX_LO_LO:[0-9]+]], 0x4800, v[[A_LO]] - -; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MAX_LO_LO]], v[[MAX_LO_HI]] -; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MAX_HI_LO]], v[[MAX_HI_HI]] - -; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}} define amdgpu_kernel void @fmax_v4f16_imm_a( +; SI-LABEL: fmax_v4f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: 
s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_lshr_b32 s5, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmax_v4f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: v_max_f16_e64 v3, s5, s5 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_max_f16_sdwa v2, v2, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmax_v4f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s8, 0x44004200 +; GFX9-NEXT: s_mov_b32 s9, 0x40004800 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v2, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 +; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %b) { entry: Index: test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -1,23 +1,91 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji 
-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b) declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) -; GCN-LABEL: {{^}}minnum_f16: -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm -define amdgpu_kernel void @minnum_f16( +define amdgpu_kernel void @minnum_f16_ieee( +; SI-LABEL: minnum_f16_ieee: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: 
s_endpgm +; +; VI-LABEL: minnum_f16_ieee: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: minnum_f16_ieee: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -29,15 +97,88 @@ ret void } -; GCN-LABEL: {{^}}minnum_f16_imm_a: -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 
0x40400000, v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm +define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) { +; SI-LABEL: minnum_f16_no_ieee: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: minnum_f16_no_ieee: +; VI: ; %bb.0: +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: minnum_f16_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog + %r.val = call half @llvm.minnum.f16(half %a, half %b) + ret half %r.val +} + define amdgpu_kernel void @minnum_f16_imm_a( +; SI-LABEL: minnum_f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: minnum_f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; 
VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: minnum_f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -47,15 +188,65 @@ ret void } -; GCN-LABEL: {{^}}minnum_f16_imm_b: -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @minnum_f16_imm_b( +; SI-LABEL: minnum_f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short 
v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: minnum_f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: minnum_f16_imm_b: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -65,33 +256,79 @@ ret void } -; GCN-LABEL: {{^}}minnum_v2f16: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], 
v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] - -; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -define amdgpu_kernel void @minnum_v2f16( +define amdgpu_kernel void @minnum_v2f16_ieee( +; SI-LABEL: minnum_v2f16_ieee: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s1, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: minnum_v2f16_ieee: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: v_max_f16_e64 v0, s5, s5 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: v_min_f16_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: minnum_v2f16_ieee: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -103,29 +340,94 @@ ret void } -; GCN-LABEL: {{^}}minnum_v2f16_imm_a: -; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 
v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 -; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200 -; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] +define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) { +; SI-LABEL: minnum_v2f16_no_ieee: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, v0, v2 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: minnum_v2f16_no_ieee: +; VI: ; %bb.0: +; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: minnum_v2f16_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog + %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r.val +} -; GCN: buffer_store_dword v[[R_V2_F16]] define amdgpu_kernel void @minnum_v2f16_imm_a( +; SI-LABEL: minnum_v2f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; 
SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: minnum_v2f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: minnum_v2f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x44004200 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, 
<2 x half> addrspace(1)* %b) { entry: @@ -135,31 +437,64 @@ ret void } -; GCN-LABEL: {{^}}minnum_v2f16_imm_b: -; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - -; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 -; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] - - -; SIVI-NOT: and -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 -; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] - -; GCN: buffer_store_dword v[[R_V2_F16]] define amdgpu_kernel void @minnum_v2f16_imm_b( +; SI-LABEL: minnum_v2f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: 
s_endpgm +; +; VI-LABEL: minnum_v2f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0x4200 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 +; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: minnum_v2f16_imm_b: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x42004400 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -170,10 +505,94 @@ } ; FIXME: Scalarize with undef half -; GCN-LABEL: {{^}}minnum_v3f16: -; GFX9: v_pk_min_f16 -; GFX9: v_pk_min_f16 define amdgpu_kernel void @minnum_v3f16( +; SI-LABEL: minnum_v3f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; 
SI-NEXT: s_lshr_b32 s1, s6, 16 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_min_f32_e32 v2, v3, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: minnum_v3f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: v_min_f16_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_min_f16_e32 v1, v2, v1 +; VI-NEXT: 
buffer_store_short v1, off, s[0:3], 0 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: minnum_v3f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s6, s6 +; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v2, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <3 x half> addrspace(1)* %r, <3 x half> addrspace(1)* %a, <3 x half> addrspace(1)* %b) { @@ -185,13 +604,107 @@ ret void } -; GCN-LABEL: {{^}}minnum_v4f16: -; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} -; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} -; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] -; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] -; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}} define amdgpu_kernel void @minnum_v4f16( +; SI-LABEL: minnum_v4f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_min_f32_e32 v3, v3, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_min_f32_e32 v1, v1, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_min_f32_e32 v2, v2, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v0, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: minnum_v4f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: v_max_f16_e64 v0, s7, s7 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: v_min_f16_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, 
v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v0, s6, s6 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: v_min_f16_e32 v0, v2, v0 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 +; VI-NEXT: v_max_f16_e64 v3, s4, s4 +; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: minnum_v4f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a, <4 x half> addrspace(1)* %b) { @@ -203,28 +716,87 @@ ret void } -; GCN-LABEL: {{^}}fmin_v4f16_imm_a: -; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} -; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200 -; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800 - -; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], [[K0]] -; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], [[K1]] -; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}} - -; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000 -; VI-DAG: v_mov_b32_e32 
[[K4:v[0-9]+]], 0x4400 - -; VI-DAG: v_min_f16_sdwa v[[MIN_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_min_f16_e32 v[[MIN_HI_LO:[0-9]+]], 0x4200, v[[A_HI]] -; VI-DAG: v_min_f16_sdwa v[[MIN_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_min_f16_e32 v[[MIN_LO_LO:[0-9]+]], 0x4800, v[[A_LO]] - -; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MIN_LO_LO]], v[[MIN_LO_HI]] -; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MIN_HI_LO]], v[[MIN_HI_HI]] - -; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}} define amdgpu_kernel void @fmin_v4f16_imm_a( +; SI-LABEL: fmin_v4f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: s_lshr_b32 s5, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmin_v4f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: v_max_f16_e64 v3, s5, s5 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmin_v4f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s8, 0x44004200 +; GFX9-NEXT: s_mov_b32 s9, 0x40004800 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v2, s4, s4 +; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 +; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %b) { entry: Index: test/CodeGen/AMDGPU/reduction.ll =================================================================== --- 
test/CodeGen/AMDGPU/reduction.ll +++ test/CodeGen/AMDGPU/reduction.ll @@ -434,12 +434,23 @@ } ; GCN-LABEL: {{^}}reduction_maxnum_v4f16: -; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 - -; VI: v_max_f16_sdwa -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1 +; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0 +; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}} + +; FIXME: Extra canonicalize leftover +; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]] + +; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0 +; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1 + +; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]] +; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]] +; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]] define half @reduction_maxnum_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> @@ -451,12 +462,24 @@ } ; GCN-LABEL: {{^}}reduction_minnum_v4f16: -; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1 +; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0 +; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}} -; VI: v_min_f16_sdwa -; VI-NEXT: v_min_f16_e32 -; 
VI-NEXT: v_min_f16_e32 +; FIXME: Extra canonicalize leftover +; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]] + + +; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0 +; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1 + +; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]] +; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]] +; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]] define half @reduction_minnum_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> @@ -471,9 +494,11 @@ ; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_f16_sdwa -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 +; VI: s_waitcnt +; VI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> @@ -490,9 +515,11 @@ ; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_f16_sdwa -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 +; VI: s_waitcnt +; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, 
v0, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32>