Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -728,6 +728,11 @@
   bool matchCombineFAddFpExtFMulToFMadOrFMAAggressive(MachineInstr &MI,
                                                       BuildFnTy &MatchInfo);
 
+  /// Transform (fptrunc (fmul x, y)) -> (fptrunc (fma x, y, 0))
+  ///           (fptrunc (fmul x, y)) -> (fptrunc (fmad x, y, 0))
+  bool matchCombineFpTruncFMulToFMadOrFMA(MachineInstr &MI,
+                                          BuildFnTy &MatchInfo);
+
   /// Transform (fsub (fmul x, y), z) -> (fma x, y, -z)
   ///           (fsub (fmul x, y), z) -> (fmad x, y, -z)
   bool matchCombineFSubFMulToFMadOrFMA(MachineInstr &MI, BuildFnTy &MatchInfo);
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3021,6 +3021,18 @@
     return isFPExtFree(DestVT, SrcVT);
   }
 
+  virtual bool isFPTruncFoldable(const MachineInstr &MI, unsigned Opcode,
+                                 LLT DestTy, LLT SrcTy) const {
+    return false;
+  }
+
+  virtual bool isFPTruncFoldable(const SelectionDAG &DAG, unsigned Opcode,
+                                 EVT DestVT, EVT SrcVT) const {
+    assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
+           "invalid fptrunc types");
+    return isFPExtFree(DestVT, SrcVT);
+  }
+
   /// Return true if folding a vector load into ExtVal (a sign, zero, or any
   /// extend node) is profitable.
   virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const { return false; }
Index: llvm/include/llvm/Target/GlobalISel/Combine.td
===================================================================
--- llvm/include/llvm/Target/GlobalISel/Combine.td
+++ llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1003,6 +1003,16 @@
          *${root}, ${info}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
 
+// Transform (fptrunc (fmul x, y)) -> (fptrunc (fma x, y, 0))
+//           (fptrunc (fmul x, y)) -> (fptrunc (fmad x, y, 0))
+def combine_fptrunc_fmul_to_fmad_or_fma: GICombineRule<
+  (defs root:$dst, build_fn_matchinfo:$info),
+  (match (G_FPTRUNC $dst, $mul):$mi, (G_FMUL $mul, $src0, $src1),
+         [{ return Helper.matchCombineFpTruncFMulToFMadOrFMA(*${mi},
+                                                             ${info}); }]),
+  (apply [{ Helper.applyBuildFn(*${mi}, ${info}); }])>;
+
+
 def combine_minmax_nan: GICombineRule<
   (defs root:$root, unsigned_matchinfo:$info),
   (match (wip_match_opcode G_FMINNUM, G_FMAXNUM, G_FMINIMUM, G_FMAXIMUM):$root,
@@ -1115,7 +1125,8 @@
     combine_fadd_fpext_fmul_to_fmad_or_fma, combine_fadd_fma_fmul_to_fmad_or_fma,
     combine_fadd_fpext_fma_fmul_to_fmad_or_fma, combine_fsub_fmul_to_fmad_or_fma,
     combine_fsub_fneg_fmul_to_fmad_or_fma, combine_fsub_fpext_fmul_to_fmad_or_fma,
-    combine_fsub_fpext_fneg_fmul_to_fmad_or_fma]>;
+    combine_fsub_fpext_fneg_fmul_to_fmad_or_fma,
+    combine_fptrunc_fmul_to_fmad_or_fma]>;
 
 def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     extract_vec_elt_combines, combines_for_extload,
Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5334,6 +5334,43 @@
   return false;
 }
 
+bool CombinerHelper::matchCombineFpTruncFMulToFMadOrFMA(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
+  const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  auto *FMulMI = getDefIgnoringCopies(SrcReg, MRI);
+  bool AllowFusionGlobally, HasFMAD, Aggressive;
+  if (!canCombineFMadOrFMA(*FMulMI, AllowFusionGlobally, HasFMAD, Aggressive))
+    return false;
+
+  unsigned PreferredFusedOpcode =
+      HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+  // fold (fptrunc (fmul x, y)) -> (fptrunc (fma x, y, 0))
+  if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+      (Aggressive || MRI.hasOneNonDBGUse(SrcReg)) &&
+      TLI.isFPTruncFoldable(MI, PreferredFusedOpcode,
+                            MRI.getType(DstReg),
+                            MRI.getType(SrcReg))) {
+    MatchInfo = [=, &MI](MachineIRBuilder &B) {
+      auto Tmp = MRI.createGenericVirtualRegister(MRI.getType(SrcReg));
+      B.buildInstr(PreferredFusedOpcode, {Tmp},
+                   {FMulMI->getOperand(1).getReg(),
+                    FMulMI->getOperand(2).getReg(),
+                    B.buildConstant(MRI.getType(SrcReg), 0).getReg(0)});
+      B.buildFPTrunc(DstReg, Tmp);
+    };
+
+    return true;
+  }
+
+  return false;
+}
+
 bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
     MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
   assert(MI.getOpcode() == TargetOpcode::G_FADD);
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -549,7 +549,7 @@
 
     template <class MatchContextClass>
     SDValue visitFSUBForFMACombine(SDNode *N);
     SDValue visitFMULForFMADistributiveCombine(SDNode *N);
-
+    SDValue visitFP_ROUNDForFMAMixCombine(SDNode *N);
     SDValue XformToShuffleWithZero(SDNode *N);
     bool reassociationCanBreakAddressingModePattern(unsigned Opc, const SDLoc &DL,
@@ -15823,6 +15823,60 @@
   return SDValue();
 }
 
+// Try to combine fptrunc (fmul x, y) -> fptrunc (fma/fmad x, y, 0), which can
+// later be selected as a mixed-precision FMA (e.g. v_fma_mixlo/v_mad_mixlo).
+SDValue DAGCombiner::visitFP_ROUNDForFMAMixCombine(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT SrcVT = N0.getValueType();
+  SDLoc SL(N);
+  const TargetOptions &Options = DAG.getTarget().Options;
+
+  assert(N->getOpcode() == ISD::FP_ROUND && "Expected FP_ROUND node");
+
+  bool HasFMA =
+      TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
+
+  bool HasFMAD = LegalOperations && TLI.isFMADLegal(DAG, N0.getNode());
+
+  // No valid opcode, do not combine.
+  if (!HasFMAD && !HasFMA)
+    return SDValue();
+
+  // Always prefer FMAD to FMA for precision.
+  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+
+  const SDNodeFlags Flags = N0.getNode()->getFlags();
+  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+                              Options.UnsafeFPMath || HasFMAD);
+
+  if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
+    return SDValue();
+
+  // If the multiplication is not contractable, do not combine.
+  if (!AllowFusionGlobally && !Flags.hasAllowContract())
+    return SDValue();
+
+  if (N0.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  // fold (fptrunc (fmul x, y)) -> (fptrunc (fma x, y, 0))
+  if (isContractableFMUL(Options, N0) &&
+      (Aggressive || N0->hasOneUse()) &&
+      TLI.isFPTruncFoldable(DAG, PreferredFusedOpcode, VT, SrcVT)) {
+    return DAG.getNode(ISD::FP_ROUND, SL, VT,
+                       DAG.getNode(PreferredFusedOpcode, SL, SrcVT,
+                                   N0.getOperand(0),
+                                   N0.getOperand(1),
+                                   DAG.getConstantFP(0.0, SL, SrcVT)),
+                       N->getOperand(1));
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitVP_FADD(SDNode *N) {
   SelectionDAG::FlagInserter FlagsInserter(DAG, N);
 
@@ -17075,6 +17129,12 @@
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
+  // FP_ROUND -> FMA combines:
+  if (SDValue Fused = visitFP_ROUNDForFMAMixCombine(N)) {
+    AddToWorklist(Fused.getNode());
+    return Fused;
+  }
+
   return SDValue();
 }
 
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -285,6 +285,12 @@
   bool isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, LLT DestTy,
                        LLT SrcTy) const override;
 
+  bool isFPTruncFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT,
+                         EVT SrcVT) const override;
+
+  bool isFPTruncFoldable(const MachineInstr &MI, unsigned Opcode, LLT DestTy,
+                         LLT SrcTy) const override;
+
   bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
 
   // While address space 7 should never make it to codegen, it still needs to
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -846,6 +846,29 @@
          !hasFP32Denormals(*MI.getMF());
 }
 
+bool SITargetLowering::isFPTruncFoldable(const SelectionDAG &DAG,
+                                         unsigned Opcode, EVT DestVT,
+                                         EVT SrcVT) const {
+  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+         DestVT.getScalarType() == MVT::f16 &&
+         SrcVT.getScalarType() == MVT::f32 &&
+         // TODO: This probably only requires no input flushing?
+         !hasFP32Denormals(DAG.getMachineFunction());
+}
+
+bool SITargetLowering::isFPTruncFoldable(const MachineInstr &MI,
+                                         unsigned Opcode, LLT DestTy,
+                                         LLT SrcTy) const {
+  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
+          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
+         DestTy.getScalarSizeInBits() == 16 &&
+         SrcTy.getScalarSizeInBits() == 32 &&
+         // TODO: This probably only requires no input flushing?
+         !hasFP32Denormals(*MI.getMF());
+}
+
 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
   // SI has some legal vector types, but no legal vector operations. Say no
   // shuffles are legal in order to prefer scalarizing some vector operations.
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1,13 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Denormal mode shouldn't matter for f16, check with and without flushing.
+ ; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-IEEE %s ; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-FLUSH %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s @@ -57,16 +58,36 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_fdiv_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX89-NEXT: v_rcp_f32_e32 v2, v2 -; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_fdiv_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_fdiv_f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f16: ; GFX10: ; %bb.0: @@ -176,16 +197,36 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_fdiv_f16_ulp25: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX89-NEXT: v_rcp_f32_e32 v2, v2 -; 
GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_fdiv_f16_ulp25: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_f16_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f16_ulp25: ; GFX10: ; %bb.0: @@ -257,16 +298,36 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_rcp_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_rcp_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_rcp_f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_f16: ; GFX10: ; %bb.0: @@ -338,16 +399,36 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_rcp_f16_arcp: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; 
GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_rcp_f16_arcp: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_rcp_f16_arcp: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_f16_arcp: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_f16_arcp: ; GFX10: ; %bb.0: @@ -542,16 +623,36 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_fdiv_f16_arcp_ulp25: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX89-NEXT: v_rcp_f32_e32 v2, v2 -; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_fdiv_f16_arcp_ulp25: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_fdiv_f16_arcp_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX10: ; %bb.0: @@ -674,25 +775,41 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, 
v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_fdiv_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16: ; GFX10: ; %bb.0: @@ -896,25 +1013,41 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_v2f16_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_fdiv_v2f16_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_v2f16_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_ulp25: ; GFX10: ; %bb.0: @@ -1051,23 +1184,38 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rcp_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_rcp_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16: ; GFX10: ; %bb.0: @@ -1201,23 +1349,38 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rcp_v2f16_arcp: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: 
v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_rcp_v2f16_arcp: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_v2f16_arcp: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_arcp: ; GFX10: ; %bb.0: @@ -1582,25 +1745,41 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: 
v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX10: ; %bb.0: Index: llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -2118,6 +2118,66 @@ ret <4 x half> %cvt.result } +define half @mixlo_fptrunc(float %x, float %y, half %z) #0 { +; GFX1100-LABEL: mixlo_fptrunc: +; GFX1100: ; %bb.0: ; %.entry +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, 0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: mixlo_fptrunc: +; GFX900: ; %bb.0: ; %.entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, 0 +; GFX900-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: mixlo_fptrunc: +; GFX906: ; %bb.0: ; %.entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, 0 +; GFX906-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: mixlo_fptrunc: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_add_f16_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: mixlo_fptrunc: +; SDAG-CI: ; %bb.0: ; %.entry +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: mixlo_fptrunc: +; GISEL-CI: ; %bb.0: ; %.entry +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +.entry: + %mul = fmul reassoc nnan nsz arcp contract afn float %x, %y + %trunc = fptrunc float %mul to half + %add = fadd reassoc nnan nsz arcp contract afn half %trunc, %z + ret half %add +} + declare half @llvm.minnum.f16(half, half) #1 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1 declare <3 x half> @llvm.minnum.v3f16(<3 x half>, <3 x half>) #1
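
Illustrative note (not part of the patch): the minimal IR shape this combine targets is a contractable f32 fmul whose only use is an fptrunc to f16. A standalone sketch, mirroring the patterns exercised by the tests above; the function name is hypothetical:

define half @fptrunc_fmul_sketch(float %x, float %y) {
  %mul = fmul contract float %x, %y
  %trunc = fptrunc float %mul to half
  ret half %trunc
}

With this change, both the SelectionDAG and GlobalISel paths are expected to rewrite the multiply as (fptrunc (fma/fmad x, y, 0.0)) when the target reports the truncation as foldable, so that on a subtarget with mad-mix/fma-mix support and f32 denormal flushing (e.g. gfx900 with -denormal-fp-math=preserve-sign) it can be selected to a single v_mad_mixlo_f16 or v_fma_mixlo_f16, as the GFX9-FLUSH, GFX900, and GFX906 check lines above show.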