Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -835,17 +835,12 @@ // // NOTE: rcp is the preference in cases that both are legal. bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { - Type *Ty = FDiv.getType()->getScalarType(); - - // The f64 rcp/rsq approximations are pretty inaccurate. We can do an - // expansion around them in codegen. - if (Ty->isDoubleTy()) + if (!Ty->isFloatTy()) return false; - // No intrinsic for fdiv16 if target does not support f16. - if (Ty->isHalfTy() && !ST->has16BitInsts()) - return false; + // The f64 rcp/rsq approximations are pretty inaccurate. We can do an + // expansion around them in codegen. f16 is good enough to always use. const FPMathOperator *FPOp = cast(&FDiv); const float ReqdAccuracy = FPOp->getFPAccuracy(); @@ -854,11 +849,10 @@ FastMathFlags FMF = FPOp->getFastMathFlags(); const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc(); - // rcp_f16 is accurate for !fpmath >= 1.0ulp. + // rcp_f16 is accurate to 0.51 ulp. // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. // rcp_f64 is never accurate. 
- const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) || - (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f); + const bool RcpIsAccurate = !HasFP32Denormals && ReqdAccuracy >= 1.0f; IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(FMF); Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4111,13 +4111,20 @@ LLT ResTy = MRI.getType(Res); const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || - MI.getFlag(MachineInstr::FmAfn); - - if (!AllowInaccurateRcp) - return false; + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || + MF.getTarget().Options.UnsafeFPMath; if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { + if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) + return false; + + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + // + // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. + // 1 / x -> RCP(x) if (CLHS->isExactlyValue(1.0)) { B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) @@ -4128,6 +4135,8 @@ return true; } + // TODO: Match rsq + // -1 / x -> RCP( FNEG(x) ) if (CLHS->isExactlyValue(-1.0)) { auto FNeg = B.buildFNeg(ResTy, RHS, Flags); @@ -4140,6 +4149,12 @@ } } + // For f16 require arcp only. + // For f32 require afn+arcp. 
+ if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || + !MI.getFlag(MachineInstr::FmArcp))) + return false; + // x / y -> x * (1.0 / y) auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) .addUse(RHS) Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9130,26 +9130,30 @@ EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateRcp = Flags.hasApproximateFuncs(); - - // Without !fpmath accuracy information, we can't do more because we don't - // know exactly whether rcp is accurate enough to meet !fpmath requirement. - if (!AllowInaccurateRcp) - return SDValue(); + bool AllowInaccurateRcp = Flags.hasApproximateFuncs() || + DAG.getTarget().Options.UnsafeFPMath; if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { + // Without !fpmath accuracy information, we can't do more because we don't + // know exactly whether rcp is accurate enough to meet !fpmath requirement. + // f16 is always accurate enough + if (!AllowInaccurateRcp && VT != MVT::f16) + return SDValue(); + if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to // use it as long as we aren't trying to use denormals. // - // v_rcp_f16 and v_rsq_f16 DO support denormals. + // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. // 1.0 / sqrt(x) -> rsq(x) // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP // error seems really high at 2^29 ULP. + + // XXX - do we need afn for this or is arcp sufficient? if (RHS.getOpcode() == ISD::FSQRT) return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); @@ -9165,6 +9169,11 @@ } } + // For f16 require arcp only. + // For f32 require afn+arcp. 
+ if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) + return SDValue(); + // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -254,36 +254,14 @@ ; GFX89-LABEL: v_neg_rcp_f16: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX89-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_neg_rcp_f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_neg_rcp_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_neg_rcp_f16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_rcp_f16_e64 v0, -v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half -1.0, %x ret half %fdiv } @@ -333,36 +311,14 @@ ; GFX89-LABEL: 
v_rcp_f16: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rcp_f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_rcp_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_rcp_f16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half 1.0, %x ret half %fdiv } @@ -412,36 +368,14 @@ ; GFX89-LABEL: v_rcp_f16_arcp: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rcp_f16_arcp: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_rcp_f16_arcp: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_rcp_f16_arcp: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half 1.0, %x ret half %fdiv } @@ -610,35 +544,23 @@ ; GFX89-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX89-NEXT: v_rcp_f32_e32 v2, v2 -; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX89-NEXT: v_rcp_f16_e32 v1, v1 +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX10-NEXT: v_rcp_f16_e32 v1, v1 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half %a, %b, !fpmath !0 ret half %fdiv @@ -720,19 +642,19 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX8-NEXT: v_rcp_f32_e32 v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -741,19 +663,19 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; 
GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX9-NEXT: v_mul_f32_e32 v3, v6, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX9-NEXT: v_mul_f32_e32 v5, v7, v5 +; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX9-NEXT: v_div_fixup_f16 v1, v5, v4, v2 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -938,19 +860,19 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX8-NEXT: v_rcp_f32_e32 v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -959,19 +881,19 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX9-NEXT: v_mul_f32_e32 v3, v6, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX9-NEXT: v_mul_f32_e32 v5, v7, v5 +; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX9-NEXT: v_div_fixup_f16 v1, v5, v4, v2 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1716,74 +1638,43 @@ ; GFX8-LABEL: v_rcp_v2f16_arcp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_rcp_f16_e32 v1, v0 +; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_rcp_f16_e32 v1, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX9-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_arcp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_rcp_f16_e32 v1, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX10-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_arcp: ; GFX11: ; %bb.0: 
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v2, v2 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f16_e32 v0, v0 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> , %x @@ -1809,7 +1700,10 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_rcp_f16_e32 v1, v0 -; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,6 +1712,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v0 ; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX9-NEXT: v_mul_f16_e32 v0, 1.0, v0 ; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1826,6 +1722,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v1, v0 ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_e32 v1, 1.0, v1 
+; GFX10-NEXT: v_mul_f16_e32 v0, 1.0, v0 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1836,6 +1734,8 @@ ; GFX11-NEXT: v_rcp_f16_e32 v0, v0 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x half> , %x @@ -1916,34 +1816,74 @@ ; GFX8-LABEL: v_rcp_v2f16_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v1, v0 -; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f16_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: 
v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_rcp_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_ulp25: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_rcp_f16_e32 v0, v0 -; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x, !fpmath !0 @@ -2087,81 +2027,43 @@ ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; 
GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX8-NEXT: v_rcp_f32_e32 v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 -; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_rcp_f16_e32 v2, v1 +; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_rcp_f16_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v3, v6, v3 :: v_dual_mul_f32 v4, v7, v4 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2 ; GFX11-NEXT: 
v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0 @@ -2357,36 +2259,23 @@ ; ; GFX89-LABEL: s_fdiv_f16_arcp: ; GFX89: ; %bb.0: -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX89-NEXT: v_rcp_f32_e32 v0, v0 -; GFX89-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX89-NEXT: v_mov_b32_e32 v1, s1 -; GFX89-NEXT: v_div_fixup_f16 v0, v0, v1, s0 +; GFX89-NEXT: v_rcp_f16_e32 v0, s1 +; GFX89-NEXT: v_mul_f16_e32 v0, s0, v0 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0 ; GFX89-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fdiv_f16_arcp: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX10-NEXT: v_rcp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0 +; GFX10-NEXT: v_rcp_f16_e32 v0, s1 +; GFX10-NEXT: v_mul_f16_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fdiv_f16_arcp: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX11-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f16_e32 v0, s1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0 +; GFX11-NEXT: v_mul_f16_e32 v0, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %a = bitcast i16 %a.arg to half @@ -2518,21 +2407,21 @@ ; GFX8-LABEL: s_fdiv_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX8-NEXT: s_lshr_b32 s2, s1, 16 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, s2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, s0 ; GFX8-NEXT: 
v_rcp_f32_e32 v0, v0 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, s3 -; GFX8-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, v3, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_div_fixup_f16 v0, v0, v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_div_fixup_f16 v1, v1, v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_div_fixup_f16 v1, v1, v2, s2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -2541,21 +2430,21 @@ ; GFX9-LABEL: s_fdiv_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, s2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: v_cvt_f32_f16_e32 v2, s0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, s3 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v2 +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_div_fixup_f16 v0, v0, v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_div_fixup_f16 v1, v1, v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_div_fixup_f16 v1, v1, v2, s2 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -2650,35 +2539,20 @@ 
; ; GFX89-LABEL: s_rcp_f16: ; GFX89: ; %bb.0: -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX89-NEXT: v_rcp_f32_e32 v0, v0 -; GFX89-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX89-NEXT: v_div_fixup_f16 v0, v0, s0, 1.0 +; GFX89-NEXT: v_rcp_f16_e32 v0, s0 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0 ; GFX89-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_rcp_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX10-NEXT: v_rcp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_div_fixup_f16 v0, v0, s0, 1.0 +; GFX10-NEXT: v_rcp_f16_e32 v0, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_rcp_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX11-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f16_e32 v0, s0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_div_fixup_f16 v0, v0, s0, 1.0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %a = bitcast i16 %a.arg to half @@ -2731,35 +2605,20 @@ ; ; GFX89-LABEL: s_neg_rcp_f16: ; GFX89: ; %bb.0: -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, -1.0 -; GFX89-NEXT: v_rcp_f32_e32 v0, v0 -; GFX89-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX89-NEXT: v_div_fixup_f16 v0, v0, s0, -1.0 +; GFX89-NEXT: v_rcp_f16_e64 v0, -s0 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0 ; GFX89-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_neg_rcp_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, -1.0 -; GFX10-NEXT: v_rcp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; 
GFX10-NEXT: v_div_fixup_f16 v0, v0, s0, -1.0 +; GFX10-NEXT: v_rcp_f16_e64 v0, -s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_neg_rcp_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, -1.0 -; GFX11-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f16_e64 v0, -s0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_div_fixup_f16 v0, v0, s0, -1.0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %a = bitcast i16 %a.arg to half @@ -2818,39 +2677,20 @@ ; ; GFX89-LABEL: s_rsq_f16: ; GFX89: ; %bb.0: -; GFX89-NEXT: v_sqrt_f16_e32 v0, s0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX89-NEXT: v_rsq_f16_e32 v0, s0 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0 ; GFX89-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_rsq_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_sqrt_f16_e32 v0, s0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX10-NEXT: v_rsq_f16_e32 v0, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_rsq_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_sqrt_f16_e32 v0, s0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX11-NEXT: v_rsq_f16_e32 v0, s0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 
v0, v1, v0, 1.0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %a = bitcast i16 %a.arg to half @@ -2952,23 +2792,20 @@ ; ; GFX8-LABEL: s_rsq_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_sqrt_f16_e32 v0, s0 -; GFX8-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_sqrt_f16_e32 v1, s0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX8-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -3093,40 +2930,14 @@ ; GFX89-LABEL: v_rsq_f16: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX89-NEXT: v_rsq_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rsq_f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_sqrt_f16_e32 
v0, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_rsq_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_rsq_f16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %sqrt = call half @llvm.sqrt.f16(half %a) %fdiv = fdiv half 1.0, %sqrt ret half %fdiv @@ -3184,38 +2995,22 @@ ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX89-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_neg_rsq_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX10-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
v_neg_rsq_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX11-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call half @llvm.sqrt.f16(half %a) %fdiv = fdiv half -1.0, %sqrt @@ -3274,38 +3069,22 @@ ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: v_sqrt_f16_e64 v0, |v0| -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX89-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_neg_rsq_f16_fabs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e64 v0, |v0| -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX10-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_neg_rsq_f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sqrt_f16_e64 v0, |v0| -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX11-NEXT: 
v_rcp_f16_e64 v0, -v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %sqrt = call half @llvm.sqrt.f16(half %a.fabs) @@ -3364,40 +3143,14 @@ ; GFX89-LABEL: v_rsq_f16_arcp: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX89-NEXT: v_rsq_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rsq_f16_arcp: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_rsq_f16_arcp: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_rsq_f16_arcp: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %sqrt = call half @llvm.sqrt.f16(half %a) %fdiv = fdiv arcp half 1.0, %sqrt ret half %fdiv @@ -3455,38 +3208,22 @@ ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX89-NEXT: 
v_cvt_f32_f16_e32 v2, -1.0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX89-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_neg_rsq_f16_arcp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX10-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_neg_rsq_f16_arcp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, -1.0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX11-NEXT: v_rcp_f16_e64 v0, -v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call half @llvm.sqrt.f16(half %a) %fdiv = fdiv arcp half -1.0, %sqrt @@ -3611,22 +3348,20 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sqrt_f16_e32 v1, v0 -; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 
+; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX8-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rsq_v2f16: @@ -3781,22 +3516,20 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sqrt_f16_e32 v1, v0 -; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 +; GFX8-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_neg_rsq_v2f16: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir @@ -2055,30 +2055,18 @@ ; VI-LABEL: name: test_fdiv_s16_constant_one_rcp ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} - ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) - ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) + ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = 
G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) + ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX9-UNSAFE: liveins: $vgpr0 @@ -2091,16 +2079,10 @@ ; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) + ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s16) = G_FCONSTANT half 1.0 %1:_(s32) = COPY $vgpr0 @@ -2143,30 +2125,20 @@ ; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} - ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; 
VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) - ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] + ; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) + ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) + ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX9-UNSAFE: liveins: $vgpr0 @@ -2180,16 +2152,11 @@ ; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} - 
; GFX10-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) - ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) - ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) - ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] - ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) - ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) + ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s16) = G_FCONSTANT half -1.0 %1:_(s32) = COPY $vgpr0 Index: llvm/test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -92,10 +92,7 @@ ; GCN-LABEL: {{^}}reciprocal_f16_rounded: ; GFX8PLUS: {{flat|global}}_load_{{ushort|u16}} [[VAL16:v[0-9]+]], v{{.+}} -; GFX8PLUS: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]] -; GFX8PLUS: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]] -; GFX8PLUS: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]] -; GFX8PLUS: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0 +; GFX8PLUS: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL16]] ; GFX8PLUS: {{flat|global}}_store_{{short|b16}} v{{.+}}, [[RESULT]] define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { entry: @@ -269,8 +266,8 @@ ; SI: v_rcp_f32 ; SI: v_mul_f32 -; GFX8PLUS: v_rcp_f32 -; GFX8PLUS: v_mul_f32 +; 
GFX8PLUS: v_rcp_f16 +; GFX8PLUS: v_mul_f16 define half @v_fdiv_f16_arcp(half %x, half %y) { %fdiv = fdiv arcp half %x, %y ret half %fdiv Index: llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -527,69 +527,54 @@ ; ; VI-SAFE-LABEL: fneg_fadd_0_f16: ; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, s1 -; VI-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; VI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-SAFE-NEXT: v_div_fixup_f16 v0, v0, s1, 1.0 +; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1 +; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0 ; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0 ; VI-SAFE-NEXT: v_add_f16_e32 v0, 0, v0 -; VI-SAFE-NEXT: v_xor_b32_e32 v3, 0x8000, v0 +; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00 ; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; VI-SAFE-NEXT: ; return to shader part epilog ; ; VI-NSZ-LABEL: fneg_fadd_0_f16: ; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, s1 -; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-NSZ-NEXT: v_rcp_f32_e32 v0, v0 -; VI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, 1.0 +; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1 +; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0 ; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 ; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00 ; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; VI-NSZ-NEXT: ; 
return to shader part epilog ; ; GFX11-SAFE-LABEL: fneg_fadd_0_f16: ; GFX11-SAFE: ; %bb.0: ; %.entry -; GFX11-SAFE-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1 ; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-SAFE-NEXT: v_div_fixup_f16 v0, v0, s1, 1.0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 ; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo ; GFX11-SAFE-NEXT: ; return to shader part epilog ; ; GFX11-NSZ-LABEL: fneg_fadd_0_f16: ; GFX11-NSZ: ; %bb.0: ; %.entry -; GFX11-NSZ-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1 ; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NSZ-NEXT: v_div_fixup_f16 v0, v0, s1, 1.0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-NEXT: 
v_cmp_nlt_f16_e64 s1, -v0, s0 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo ; GFX11-NSZ-NEXT: ; return to shader part epilog