Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -740,6 +740,11 @@
   Type *Ty = FDiv.getType()->getScalarType();
 
+  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
+  // expansion around them in codegen.
+  if (Ty->isDoubleTy())
+    return false;
+
   // No intrinsic for fdiv16 if target does not support f16.
   if (Ty->isHalfTy() && !ST->has16BitInsts())
     return false;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -128,6 +128,8 @@
                        MachineIRBuilder &B) const;
   bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B) const;
+  bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                MachineIRBuilder &B) const;
   bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B) const;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2752,9 +2752,6 @@
   LLT S32 = LLT::scalar(32);
   LLT S64 = LLT::scalar(64);
 
-  if (legalizeFastUnsafeFDIV(MI, MRI, B))
-    return true;
-
   if (DstTy == S16)
     return legalizeFDIV16(MI, MRI, B);
   if (DstTy == S32)
@@ -3092,9 +3089,49 @@
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
+                                                   MachineRegisterInfo &MRI,
+                                                   MachineIRBuilder &B) const {
+  Register Res = MI.getOperand(0).getReg();
+  Register X = MI.getOperand(1).getReg();
+  Register Y = MI.getOperand(2).getReg();
+  uint16_t Flags = MI.getFlags();
+  LLT ResTy = MRI.getType(Res);
+
+  const MachineFunction &MF = B.getMF();
+  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
+                            MI.getFlag(MachineInstr::FmAfn);
+
+  if (!AllowInaccurateRcp)
+    return false;
+
+  auto NegY = B.buildFNeg(ResTy, Y);
+  auto One = B.buildFConstant(ResTy, 1.0);
+
+  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+    .addUse(Y)
+    .setMIFlags(Flags);
+
+  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
+  R = B.buildFMA(ResTy, Tmp0, R, R);
+
+  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
+  R = B.buildFMA(ResTy, Tmp1, R, R);
+
+  auto Ret = B.buildFMul(ResTy, X, R);
+  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
+
+  B.buildFMA(Res, Tmp2, R, Ret);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &B) const {
+  if (legalizeFastUnsafeFDIV(MI, MRI, B))
+    return true;
+
   Register Res = MI.getOperand(0).getReg();
   Register LHS = MI.getOperand(1).getReg();
   Register RHS = MI.getOperand(2).getReg();
@@ -3157,6 +3194,9 @@
 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &B) const {
+  if (legalizeFastUnsafeFDIV(MI, MRI, B))
+    return true;
+
   Register Res = MI.getOperand(0).getReg();
   Register LHS = MI.getOperand(1).getReg();
   Register RHS = MI.getOperand(2).getReg();
@@ -3223,6 +3263,9 @@
 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &B) const {
+  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
+    return true;
+
   Register Res = MI.getOperand(0).getReg();
   Register LHS = MI.getOperand(1).getReg();
   Register RHS = MI.getOperand(2).getReg();
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -92,6 +92,7 @@
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFastUnsafeFDIV64(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8213,6 +8213,33 @@
   return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
 }
 
+SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  const SDNodeFlags Flags = Op->getFlags();
+
+  bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
+                            DAG.getTarget().Options.UnsafeFPMath;
+  if (!AllowInaccurateDiv)
+    return SDValue();
+
+  SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
+  SDValue One = DAG.getConstantFP(1.0, SL, VT);
+
+  SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
+  SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
+
+  R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
+  SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
+  R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
+  SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
+  SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
+  return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
+}
+
 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue GlueChain,
                           SDNodeFlags Flags) {
@@ -8441,8 +8468,8 @@
 }
 
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
-  if (DAG.getTarget().Options.UnsafeFPMath)
-    return lowerFastUnsafeFDIV(Op, DAG);
+  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
+    return FastLowered;
 
   SDLoc SL(Op);
   SDValue X = Op.getOperand(0);
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -786,7 +786,6 @@
 
 let OtherPredicates = [UnsafeFPMath] in {
 
-//def : RcpPat<V_RCP_F64_e32, f64>;
 //defm : RsqPat<V_RSQ_F64_e32, f64>;
 //defm : RsqPat<V_RSQ_F32_e32, f32>;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -67,8 +67,14 @@
 ; GCN-LABEL: v_fdiv_f64_afn:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GCN-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GCN-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
+; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn double %a, %b
   ret double %fdiv
@@ -245,7 +251,14 @@
 ; GCN-LABEL: v_rcp_f64_arcp_afn:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f64_e32 v[0:1], v[0:1]
+; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; GCN-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; GCN-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; GCN-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
+; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn double 1.0, %x
   ret double %fdiv
@@ -311,8 +324,14 @@
 ; GCN-LABEL: v_fdiv_f64_afn_ulp25:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GCN-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GCN-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GCN-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GCN-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
+; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn double %a, %b, !fpmath !0
   ret double %fdiv
@@ -471,10 +490,22 @@
 ; GCN-LABEL: v_fdiv_v2f64_afn:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
-; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
-; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x double> %a, %b
   ret <2 x double> %fdiv
@@ -766,8 +797,22 @@
 ; GCN-LABEL: v_rcp_v2f64_arcp_afn:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT: v_rcp_f64_e32 v[2:3], v[2:3]
+; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
+; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
+; GCN-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; GCN-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; GCN-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; GCN-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GCN-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; GCN-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; GCN-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; GCN-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GCN-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
+; GCN-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
+; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; GCN-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn <2 x double> <double 1.0, double 1.0>, %x
   ret <2 x double> %fdiv
@@ -871,10 +916,22 @@
 ; GCN-LABEL: v_fdiv_v2f64_afn_ulp25:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
-; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
-; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -978,10 +1035,22 @@
 ; GCN-LABEL: v_fdiv_v2f64_arcp_afn_ulp25:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f64_e32 v[4:5], v[4:5]
-; GCN-NEXT: v_rcp_f64_e32 v[6:7], v[6:7]
-; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
+; GCN-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
+; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GCN-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
+; GCN-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; GCN-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
+; GCN-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
+; GCN-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
+; GCN-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
+; GCN-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
+; GCN-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
+; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
+; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -410,10 +410,16 @@
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v3, s1
 ; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
 ; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
+; CI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; CI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
 ; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
 ; CI-NEXT: v_mov_b32_e32 v2, s4
@@ -429,10 +435,16 @@
 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
 ; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
 ; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
+; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
 ; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
 ; VI-NEXT: v_mov_b32_e32 v2, s4
@@ -455,10 +467,16 @@
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v3, s1
 ; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; CI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
 ; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
+; CI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; CI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
 ; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
 ; CI-NEXT: v_mov_b32_e32 v2, s4
@@ -474,10 +492,16 @@
 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
 ; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
 ; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
+; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
 ; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
 ; VI-NEXT: v_mov_b32_e32 v2, s4
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
@@ -467,9 +467,17 @@
 ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64
 ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY1]]
+ ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
 ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s64)
- ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[INT]]
- ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMUL]](s64)
+ ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
+ ; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
+ ; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
+ ; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
+ ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMA3]]
+ ; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]]
+ ; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
+ ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
 ; GFX10-LABEL: name: test_fdiv_s64
 ; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX10: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
@@ -1140,11 +1148,26 @@
 ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
 ; GFX9-UNSAFE: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
+ ; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[UV2]]
+ ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
 ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV2]](s64)
- ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[INT]]
+ ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
+ ; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
+ ; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
+ ; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
+ ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[FMA3]]
+ ; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[UV]]
+ ; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
+ ; GFX9-UNSAFE: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[UV3]]
 ; GFX9-UNSAFE: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV3]](s64)
- ; GFX9-UNSAFE: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[INT1]]
- ; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMUL]](s64), [[FMUL1]](s64)
+ ; GFX9-UNSAFE: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[INT1]], [[C]]
+ ; GFX9-UNSAFE: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FMA6]], [[INT1]], [[INT1]]
+ ; GFX9-UNSAFE: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA7]], [[C]]
+ ; GFX9-UNSAFE: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMA8]], [[FMA7]], [[FMA7]]
+ ; GFX9-UNSAFE: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[FMA9]]
+ ; GFX9-UNSAFE: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMUL1]], [[UV1]]
+ ; GFX9-UNSAFE: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]]
+ ; GFX9-UNSAFE: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64)
 ; GFX9-UNSAFE: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
 ; GFX10-LABEL: name: test_fdiv_v2s64
 ; GFX10: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
@@ -2312,9 +2335,18 @@
 ; GFX9: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
 ; GFX9: $vgpr0_vgpr1 = COPY [[INT6]](s64)
 ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp
+ ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
 ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
 ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64)
- ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[INT]](s64)
+ ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]]
+ ; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
+ ; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]]
+ ; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
+ ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]]
+ ; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
+ ; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
+ ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
 ; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp
 ; GFX10: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
 ; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
@@ -2409,10 +2441,19 @@
 ; GFX9: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
 ; GFX9: $vgpr0_vgpr1 = COPY [[INT6]](s64)
 ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
+ ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00
 ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
 ; GFX9-UNSAFE: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
- ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s64)
- ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[INT]](s64)
+ ; GFX9-UNSAFE: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64)
+ ; GFX9-UNSAFE: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C1]]
+ ; GFX9-UNSAFE: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]]
+ ; GFX9-UNSAFE: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C1]]
+ ; GFX9-UNSAFE: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]]
+ ; GFX9-UNSAFE: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]]
+ ; GFX9-UNSAFE: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
+ ; GFX9-UNSAFE: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
+ ; GFX9-UNSAFE: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
 ; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
 ; GFX10: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00
 ; GFX10: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
Index: llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -38,6 +38,35 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fdiv_f64_afn:
+; GCN: v_rcp_f64_e32 v[4:5], v[2:3]
+; GCN: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GCN: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GCN: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GCN: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GCN: v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GCN: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
+; GCN: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
+; GCN: s_setpc_b64
+define double @v_fdiv_f64_afn(double %x, double %y) #0 {
+  %result = fdiv afn double %x, %y
+  ret double %result
+}
+
+; GCN-LABEL: {{^}}v_rcp_f64_afn:
+; GCN: v_rcp_f64_e32 v[2:3], v[0:1]
+; GCN: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; GCN: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; GCN: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; GCN: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; GCN: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; GCN: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; GCN: s_setpc_b64
+define double @v_rcp_f64_afn(double %x) #0 {
+  %result = fdiv afn double 1.0, %x
+  ret double %result
+}
+
 ; GCN-LABEL: {{^}}fdiv_f64_s_v:
 define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
   %den = load double, double addrspace(1)* %in
Index: llvm/test/CodeGen/AMDGPU/frem.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/frem.ll
+++ llvm/test/CodeGen/AMDGPU/frem.ll
@@ -711,41 +711,34 @@
 define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; SI-LABEL: fast_frem_f64:
 ; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s15, 0xf000
+; SI-NEXT: s_mov_b32 s14, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s8
-; SI-NEXT: s_mov_b32 s5, s9
-; SI-NEXT: s_mov_b32 s0, s10
-; SI-NEXT: s_mov_b32 s1, s11
-; SI-NEXT: s_mov_b32 s2, s6
-; SI-NEXT: s_mov_b32 s3, s7
-; SI-NEXT: s_mov_b32 s14, s6
-; SI-NEXT: s_mov_b32 s15, s7
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s0, s6
+; SI-NEXT: s_mov_b32 s1, s7
+; SI-NEXT: s_mov_b32 s2, s14
+; SI-NEXT: s_mov_b32 s3, s15
+; SI-NEXT: s_mov_b32 s10, s14
+; SI-NEXT: s_mov_b32 s11, s15
 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
-; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
-; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
-; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9
-; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; SI-NEXT: s_nop 1
-; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
-; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
+; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
 ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
 ; SI-NEXT: s_mov_b32 s1, 0xfffff
-; SI-NEXT: s_mov_b32 s0, s6
+; SI-NEXT: s_mov_b32 s0, s14
 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8
 ; SI-NEXT: v_not_b32_e32 v6, v6
 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
@@ -759,7 +752,7 @@
 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
 ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1]
 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
 ; SI-NEXT: s_endpgm
 ;
 ; CI-LABEL: fast_frem_f64:
@@ -780,18 +773,14 @@
 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
-; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
-; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
-; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
-; CI-NEXT: s_nop 1
-; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
-; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
+; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -811,18 +800,14 @@
 ; VI-NEXT: v_mov_b32_e32 v0, s4
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
-; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
-; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
-; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
-; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
-; VI-NEXT: s_nop 1
-; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
-; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
+; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
+; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
+; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
+; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
+; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -855,7 +840,13 @@
 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
+; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
 ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
 ; SI-NEXT: s_mov_b32 s1, 0xfffff
@@ -895,7 +886,13 @@
 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
+; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -916,7 +913,13 @@
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; VI-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
+; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
+; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
+; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
+; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -107,9 +107,13 @@
 }
 
 ; FUNC-LABEL: {{^}}unsafe_rcp_pat_f64:
-; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
-; SI-NOT: [[RESULT]]
-; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: v_rcp_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
 define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
   %rcp = fdiv double 1.0, %src
   store double %rcp, double addrspace(1)* %out, align 8
Index: llvm/test/CodeGen/AMDGPU/rsq.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/rsq.ll
+++ llvm/test/CodeGen/AMDGPU/rsq.ll
@@ -95,9 +95,15 @@
 ; SI-SAFE: v_sqrt_f64_e32
 ; SI-SAFE: v_div_scale_f64
 
-; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}
-; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
-; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
+; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], [[VAL]]
+; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], [[VAL]]
+; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
 define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 {
   %val = load double, double addrspace(1)* %in, align 4
   %sqrt = call double @llvm.sqrt.f64(double %val)
@@ -127,9 +133,16 @@
 ; SI-SAFE: v_sqrt_f64_e64 v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
 ; SI-SAFE: v_div_scale_f64
 
-; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}}
-; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
-; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
+; SI-UNSAFE: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; SI-UNSAFE-DAG: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -[[VAL]]
+; SI-UNSAFE-DAG: v_xor_b32_e32 v[[HI:[0-9]+]], 0x80000000, v{{[0-9]+}}
+; SI-UNSAFE: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+}}:[[HI]]{{\]}}
+; SI-UNSAFE: v_fma_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}, [[RSQ]], 1.0
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
+; SI-UNSAFE: v_fma_f64
 define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 {
   %val = load double, double addrspace(1)* %in, align 4
   %val.fneg = fsub double -0.0, %val