Index: llvm/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.td +++ llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1715,6 +1715,8 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; +def NotHasMed3_16 : Predicate<"!Subtarget->hasMed3_16()">; + def HasMinMaxDenormModes : Predicate<"Subtarget->supportsMinMaxDenormModes()">; def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()">; Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -112,9 +112,33 @@ [{ return PostLegalizerHelper.matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]), (apply [{ PostLegalizerHelper.applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>; -// Combines which should only apply on SI/VI + +let Predicates = [Has16BitInsts, NotHasMed3_16] in { +// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This +// saves one instruction compared to the promotion. +// +// FIXME: Should have ComplexPattern like in/out matchers +// +// FIXME: We should be able to match either G_AMDGPU_FMED3 or +// G_INTRINSIC @llvm.amdgcn.fmed3. Currently the legalizer will +// replace the intrinsic with G_AMDGPU_FMED3 since we can't write a +// pattern to match it. 
+def expand_promoted_fmed3 : GICombineRule< + (defs root:$fptrunc_dst), + (match (G_FPTRUNC $fptrunc_dst, $fmed3_dst):$fptrunc, + (G_AMDGPU_FMED3 $fmed3_dst, $src0, $src1, $src2), + [{ return Helper.matchExpandPromotedF16FMed3(*${fptrunc}, ${src0}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]), + (apply [{ Helper.applyExpandPromotedF16FMed3(*${fptrunc}, ${src0}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]) +>; + +} // End Predicates = [Has16BitInsts, NotHasMed3_16] + +// Combines which should only apply on SI/CI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; +// Combines which should only apply on VI +def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>; + def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16, foldable_fneg]> { @@ -125,7 +149,7 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPostLegalizerCombinerHelper", - [all_combines, gfx6gfx7_combines, + [all_combines, gfx6gfx7_combines, gfx8_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq, sign_extension_in_reg]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; Index: llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h +++ llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h @@ -23,4 +23,9 @@ bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); + + bool matchExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, + Register Src1, Register Src2); + void applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, + Register Src1, Register Src2); }; Index: llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ 
llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -380,3 +380,56 @@ MI.eraseFromParent(); } + +// Returns true if Reg is defined by a G_FPEXT whose source is s16, or by a G_FCONSTANT whose value converts to IEEE half without losing information. TODO: Should return converted value / extension source and avoid introducing +// intermediate fptruncs in the apply function. +static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI, + Register Reg) { + const MachineInstr *Def = MRI.getVRegDef(Reg); + if (Def->getOpcode() == TargetOpcode::G_FPEXT) { + Register SrcReg = Def->getOperand(1).getReg(); + return MRI.getType(SrcReg) == LLT::scalar(16); + } + + if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) { + APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF(); + bool LosesInfo = true; + Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo); + return !LosesInfo; + } + + return false; +} + +bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI, + Register Src0, + Register Src1, + Register Src2) { + assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC); + Register SrcReg = MI.getOperand(1).getReg(); + if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32)) + return false; + + return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) && + isFPExtFromF16OrConst(MRI, Src2); +} + +void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI, + Register Src0, + Register Src1, + Register Src2) { + Builder.setInstrAndDebugLoc(MI); + + // Truncate each source back to s16. We expect fptrunc (fpext x) to fold out, and to constant fold any constant + // sources. 
+ Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0); + Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0); + Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0); + + LLT Ty = MRI.getType(Src0); + auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1); + auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1); + auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2); + Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1); + MI.eraseFromParent(); +} Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5885,6 +5885,17 @@ return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntrinsic(MI, B); + case Intrinsic::amdgcn_fmed3: { + GISelChangeObserver &Observer = Helper.Observer; + + // FIXME: This is to work around the inability of tablegen match combiners to + // match intrinsics in patterns. 
+ Observer.changingInstr(MI); + MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3)); + MI.removeOperand(1); + Observer.changedInstr(MI); + return true; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3762,6 +3762,7 @@ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: case AMDGPU::G_AMDGPU_SMED3: + case AMDGPU::G_AMDGPU_FMED3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -198,6 +198,7 @@ SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -772,6 +772,9 @@ ISD::INSERT_VECTOR_ELT, ISD::FCOPYSIGN}); + if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16()) + setTargetDAGCombine(ISD::FP_ROUND); + // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. 
setTargetDAGCombine({ISD::LOAD, @@ -11103,6 +11106,71 @@ return DAG.getBuildVector(VecVT, SL, Ops); } +/// Return the source of an fp_extend from f16 to f32, or an FP constant +/// converted to f16 when the conversion is exact; otherwise a null SDValue. +static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) { + if (Src.getOpcode() == ISD::FP_EXTEND && + Src.getOperand(0).getValueType() == MVT::f16) { + return Src.getOperand(0); + } + + if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) { + APFloat Val = CFP->getValueAPF(); + bool LosesInfo = true; + Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo); + if (!LosesInfo) + return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16); + } + + return SDValue(); +} + +SDValue SITargetLowering::performFPRoundCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() && + "combine only useful on gfx8"); + + SDValue TruncSrc = N->getOperand(0); + EVT VT = N->getValueType(0); + if (VT != MVT::f16) + return SDValue(); + + if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 || + TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse() || + !isNullConstant(N->getOperand(1))) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3, + // and expanding it with min/max saves 1 instruction vs. casting to f32 and + // casting back. + + // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) => + // fmin(fmax(a, b), fmax(fmin(a, b), c)) + SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0)); + if (!A) + return SDValue(); + + SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1)); + if (!B) + return SDValue(); + + SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2)); + if (!C) + return SDValue(); + + // This changes signaling nan behavior. If an input is a signaling nan, it + // would have been quieted by the fpext originally. We don't care because + // these are unconstrained ops. 
If we needed to insert quieting canonicalizes + // we would be worse off than just doing the promotion. + SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B); + SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B); + SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C); + return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1); +} + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -11858,6 +11926,8 @@ return performExtractVectorEltCombine(N, DCI); case ISD::INSERT_VECTOR_ELT: return performInsertVectorEltCombine(N, DCI); + case ISD::FP_ROUND: + return performFPRoundCombine(N, DCI); case ISD::LOAD: { if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI)) return Widended; Index: llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -7,7 +7,7 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s -; Legal f16 med3. InstCombine ought to shrink the f32 op to f16. +; Legal f16 med3. InstCombine ought to shrink the f32 op to f16 so the codegen doesn't really matter for this. 
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s @@ -35,15 +35,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16: ; GFX9: ; %bb.0: @@ -81,15 +89,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_flags: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_flags: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_flags: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_flags: ; GFX9: ; %bb.0: @@ -181,14 +197,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_k0: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_med3_f32 v0, 2.0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e32 v2, 2.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, 2.0, v0 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v2, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_min_f16_e32 v2, 2.0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, 2.0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v2, v1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_k0: ; GFX9: ; %bb.0: @@ -223,14 +248,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_k1: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_med3_f32 v0, v0, 2.0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e32 v2, 2.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, 2.0, v0 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v2, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k1: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_min_f16_e32 v2, 2.0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, 2.0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v2, v1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_k1: ; GFX9: ; %bb.0: @@ -265,14 +299,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_k2: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, 2.0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v2, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k2: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, 2.0, 
v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_k2: ; GFX9: ; %bb.0: @@ -308,14 +351,25 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_k0_k1: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x41800000 -; GFX8-NEXT: v_med3_f32 v0, 0, v1, v0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 0x4c00 +; GFX8-SDAG-NEXT: v_max_f16_e32 v2, 0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v1, 0, v1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v2, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k1: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0x4c00 +; GFX8-GISEL-NEXT: v_min_f16_e32 v2, 0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, 0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v2, v0 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_k0_k1: ; GFX9: ; %bb.0: @@ -348,13 +402,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_k0_k2: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_med3_f32 v0, 0, v0, 2.0 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e32 v1, 0, v0 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, 0, v0 
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k2: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_min_f16_e32 v1, 0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, 0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, 2.0, v1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_k0_k2: ; GFX9: ; %bb.0: @@ -394,15 +458,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_fabs: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX8-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX8-NEXT: v_cvt_f32_f16_e64 v2, |v2| -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fabs: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e64 v3, |v0|, |v1| +; GFX8-SDAG-NEXT: v_min_f16_e64 v0, |v0|, |v1| +; GFX8-SDAG-NEXT: v_max_f16_e64 v0, v0, |v2| +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fabs: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_min_f16_e64 v3, |v0|, |v1| +; GFX8-GISEL-NEXT: v_max_f16_e64 v0, |v0|, |v1| +; GFX8-GISEL-NEXT: v_max_f16_e64 v1, v3, |v2| +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_fabs: ; GFX9: ; %bb.0: @@ -492,15 +564,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_fneg: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX8-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; GFX8-NEXT: v_cvt_f32_f16_e64 v2, -v2 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e64 v3, -v0, -v1 +; GFX8-SDAG-NEXT: v_min_f16_e64 v0, -v0, -v1 +; GFX8-SDAG-NEXT: v_max_f16_e64 v0, v0, -v2 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fneg: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_min_f16_e64 v3, -v0, -v1 +; GFX8-GISEL-NEXT: v_max_f16_e64 v0, -v0, -v1 +; GFX8-GISEL-NEXT: v_max_f16_e64 v1, v3, -v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_fneg: ; GFX9: ; %bb.0: @@ -596,15 +676,23 @@ ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_fneg_fabs: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; GFX8-NEXT: v_cvt_f32_f16_e64 v1, -|v1| -; GFX8-NEXT: v_cvt_f32_f16_e64 v2, -|v2| -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_max_f16_e64 v3, -|v0|, -|v1| +; GFX8-SDAG-NEXT: v_min_f16_e64 v0, -|v0|, -|v1| +; GFX8-SDAG-NEXT: v_max_f16_e64 v0, v0, -|v2| +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fneg_fabs: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX8-GISEL-NEXT: v_min_f16_e64 v3, -|v0|, -|v1| +; GFX8-GISEL-NEXT: v_max_f16_e64 v0, -|v0|, -|v1| +; GFX8-GISEL-NEXT: v_max_f16_e64 v1, v3, -|v2| +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_fneg_fabs: ; GFX9: ; %bb.0: @@ -715,11 +803,10 @@ ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16: @@ -773,17 +860,29 @@ ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use_0: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GFX8-NEXT: flat_store_dword v[3:4], v5 -; GFX8-NEXT: v_med3_f32 v0, v5, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX8-SDAG-NEXT: flat_store_dword v[3:4], v5 +; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: 
fmed3_f32_fpext_f16_multi_use_0: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX8-GISEL-NEXT: flat_store_dword v[3:4], v5 +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_0: ; GFX9: ; %bb.0: @@ -828,17 +927,29 @@ ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use_1: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: flat_store_dword v[3:4], v1 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX8-SDAG-NEXT: flat_store_dword v[3:4], v5 +; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_1: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX8-GISEL-NEXT: flat_store_dword v[3:4], v5 +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_1: ; GFX9: ; %bb.0: @@ -883,17 +994,29 @@ ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use_2: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: flat_store_dword v[3:4], v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX8-SDAG-NEXT: flat_store_dword v[3:4], v5 +; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_2: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX8-GISEL-NEXT: flat_store_dword v[3:4], v5 +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_2: ; GFX9: ; %bb.0: @@ -944,11 +1067,10 @@ ; GFX8-GISEL-LABEL: fmed3_f32_fpext_bf16: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 
-; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmed3_f32_fpext_bf16: @@ -1006,11 +1128,10 @@ ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_bf16_0: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_bf16_0: @@ -1070,11 +1191,10 @@ ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_bf16_1: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_bf16_1: @@ -1134,11 +1254,10 @@ ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_bf16_2: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX8-GISEL-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; 
GFX8-GISEL-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v3, v2 +; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_bf16_2: