Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -266,17 +266,20 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( MachineInstr &MI, std::function &MatchInfo) const { + auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * { + if (!MI.getFlag(MachineInstr::FmContract)) + return nullptr; - auto getRcpSrc = [=](const MachineInstr &MI) { - MachineInstr *ResMI = nullptr; if (auto *GI = dyn_cast(&MI)) { if (GI->is(Intrinsic::amdgcn_rcp)) - ResMI = MRI.getVRegDef(MI.getOperand(2).getReg()); + return MRI.getVRegDef(MI.getOperand(2).getReg()); } - return ResMI; + return nullptr; }; - auto getSqrtSrc = [=](const MachineInstr &MI) { + auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * { + if (!MI.getFlag(MachineInstr::FmContract)) + return nullptr; MachineInstr *SqrtSrcMI = nullptr; auto Match = mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI))); @@ -304,7 +307,6 @@ }; return true; } - return false; } Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -212,6 +212,7 @@ SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFDivCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -762,6 +762,7 @@ ISD::USUBO_CARRY, ISD::FADD, ISD::FSUB, + ISD::FDIV, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINNUM_IEEE, @@ -11136,7 +11137,9 @@ N->getFlags()); } - if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) { + // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. + if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && + N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0), N->getFlags()); } @@ -12505,6 +12508,41 @@ return SDValue(); } +SDValue SITargetLowering::performFDivCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + EVT VT = N->getValueType(0); + if (VT != MVT::f16 || !Subtarget->has16BitInsts()) + return SDValue(); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + SDNodeFlags Flags = N->getFlags(); + SDNodeFlags RHSFlags = RHS->getFlags(); + if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || + !RHS->hasOneUse()) + return SDValue(); + + if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { + bool IsNegative = false; + if (CLHS->isExactlyValue(1.0) || + (IsNegative = CLHS->isExactlyValue(-1.0))) { + // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 + // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 + if (RHS.getOpcode() == ISD::FSQRT) { + // TODO: Or in RHS flags, somehow missing from SDNodeFlags + SDValue Rsq = + DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags); + return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq; + } + } + } + + return SDValue(); +} + SDValue SITargetLowering::performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12768,6 +12806,8 @@ return performFAddCombine(N, DCI); case ISD::FSUB: return performFSubCombine(N, DCI); + case ISD::FDIV: + return performFDivCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); case ISD::FMAXNUM: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll @@ -4,7 +4,8 @@ define amdgpu_cs float @div_sqrt(float inreg %arg1) { ; GCN-LABEL: div_sqrt: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_rsq_f32_e32 v0, s0 +; GCN-NEXT: v_sqrt_f32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: ; return to shader part epilog .entry: %a = call float @llvm.sqrt.f32(float %arg1) @@ -15,7 +16,8 @@ define amdgpu_cs float @sqrt_div(float inreg %arg1) { ; GCN-LABEL: sqrt_div: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_rsq_f32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, s0 +; GCN-NEXT: v_sqrt_f32_e32 v0, v0 ; GCN-NEXT: ; return to shader part epilog .entry: %a = fdiv afn float 1.000000e+00, %arg1 @@ -26,7 +28,8 @@ define amdgpu_cs float @rcp_sqrt(float inreg %arg1) { ; GCN-LABEL: rcp_sqrt: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_rsq_f32_e32 v0, s0 +; GCN-NEXT: v_sqrt_f32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: ; return to shader part epilog .entry: %a = call float @llvm.sqrt.f32(float %arg1) @@ -37,7 +40,8 @@ define amdgpu_cs float @sqrt_rcp(float inreg %arg1) { ; GCN-LABEL: sqrt_rcp: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_rsq_f32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, s0 +; GCN-NEXT: v_sqrt_f32_e32 v0, v0 ; GCN-NEXT: ; return to shader part epilog .entry: %a = call float @llvm.amdgcn.rcp.f32(float %arg1) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir @@ -11,7 +11,8 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 - ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32) + ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]] + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FSQRT]](s32) ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %0:_(s32) = COPY $sgpr0 @@ -53,8 +54,9 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 - ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32) - ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s32) + ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[INT]] + ; GCN-NEXT: $vgpr0 = COPY [[FSQRT]](s32) ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %0:_(s32) = COPY $sgpr0 %2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0:_(s32) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -3282,14 +3282,24 @@ ; GFX89-LABEL: v_rsq_f16_missing_contract0: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_rsq_f16_e32 v0, v0 +; GFX89-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_rsq_f16_missing_contract0: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_rsq_f16_missing_contract0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX10-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rsq_f16_missing_contract0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_rcp_f16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call half @llvm.sqrt.f16(half %a) %fdiv = fdiv contract half 1.0, %sqrt ret half %fdiv @@ -3346,14 +3356,24 @@ ; GFX89-LABEL: v_rsq_f16_missing_contract1: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_rsq_f16_e32 v0, v0 +; GFX89-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_rsq_f16_missing_contract1: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_rsq_f16_missing_contract1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX10-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rsq_f16_missing_contract1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_rcp_f16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract half @llvm.sqrt.f16(half %a) %fdiv = fdiv half 1.0, %sqrt ret half %fdiv @@ -3767,14 +3787,24 @@ ; GFX89-LABEL: v_rsq_f16_afn_nocontract: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_rsq_f16_e32 v0, v0 +; GFX89-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_rsq_f16_afn_nocontract: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_rsq_f16_afn_nocontract: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX10-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rsq_f16_afn_nocontract: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_rcp_f16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call half @llvm.sqrt.f16(half %a) %fdiv = fdiv afn half 1.0, %sqrt ret half %fdiv Index: llvm/test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -744,11 +744,11 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX8-NEXT: v_rcp_f16_e64 v3, -v0 +; GFX8-NEXT: v_rsq_f16_e32 v3, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_short v[0:1], v3 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v3 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_rsq_f16_neg: @@ -758,8 +758,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 -; GFX9-NEXT: v_rcp_f16_e64 v1, -v1 +; GFX9-NEXT: v_rsq_f16_e32 v1, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -770,8 +770,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 -; GFX10-NEXT: v_rcp_f16_e64 v1, -v1 +; GFX10-NEXT: v_rsq_f16_e32 v1, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -782,9 +782,9 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX11-NEXT: v_rsq_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_rcp_f16_e64 v1, -v1 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -952,7 +952,8 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_rsq_f16_e32 v3, v0 +; GFX8-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX8-NEXT: v_rcp_f16_e32 v3, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_store_short v[0:1], v3 @@ -965,7 +966,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rsq_f16_e32 v1, v1 +; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX9-NEXT: v_rcp_f16_e32 v1, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -976,7 +978,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_rsq_f16_e32 v1, v1 +; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX10-NEXT: v_rcp_f16_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -987,7 +990,9 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_rsq_f16_e32 v1, v1 +; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1048,7 +1053,8 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_rsq_f16_e32 v3, v0 +; GFX8-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX8-NEXT: v_rcp_f16_e32 v3, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_store_short v[0:1], v3 @@ -1061,7 +1067,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rsq_f16_e32 v1, v1 +; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX9-NEXT: v_rcp_f16_e32 v1, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1072,7 +1079,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_rsq_f16_e32 v1, v1 +; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX10-NEXT: v_rcp_f16_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1083,7 +1091,9 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_rsq_f16_e32 v1, v1 +; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2043,45 +2053,39 @@ ; GFX8-LABEL: v_neg_rsq_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX8-NEXT: v_rcp_f16_sdwa v1, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_rcp_f16_e64 v0, -v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_rsq_f16_e32 v1, v0 +; GFX8-NEXT: v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x8000 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX8-NEXT: v_xor_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_neg_rsq_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX9-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX9-NEXT: v_rcp_f16_e64 v0, -v0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_rsq_f16_e32 v0, v0 +; GFX9-NEXT: v_pack_b32_f16 v0, -v0, -v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_neg_rsq_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX10-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX10-NEXT: v_rcp_f16_e64 v0, -v0 -; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_rsq_f16_e32 v0, v0 +; GFX10-NEXT: v_pack_b32_f16 v0, -v0, -v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_neg_rsq_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-NEXT: v_rsq_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_rcp_f16_e64 v0, -v0 -; GFX11-NEXT: v_rcp_f16_e64 v1, -v1 +; GFX11-NEXT: v_rsq_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, -v0, -v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX9-IEEE-LABEL: v_neg_rsq_v2f16: ; GFX9-IEEE: ; %bb.0: Index: llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -347,7 +347,8 @@ ; CHECK-LABEL: v_fdiv_recip_sqrt_f32_afn: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_rsq_f32_e32 v0, v0 +; CHECK-NEXT: v_sqrt_f32_e32 v0, v0 +; CHECK-NEXT: v_rcp_f32_e32 v0, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %sqrt = call afn float @llvm.sqrt.f32(float %x) %fdiv = fdiv afn float 1.0, %sqrt @@ -456,11 +457,57 @@ } define float @v_fdiv_recip_sqrt_f32_arcp_afn(float %x) { -; CHECK-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_rsq_f32_e32 v0, v0 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; CODEGEN-DAZ-SDAG: ; %bb.0: +; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; CODEGEN-DAZ-GISEL: ; %bb.0: +; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; IR-DAZ-SDAG: ; %bb.0: +; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; IR-DAZ-GISEL: ; %bb.0: +; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call arcp afn float @llvm.sqrt.f32(float %x) %fdiv = fdiv arcp afn float 1.0, %sqrt ret float %fdiv @@ -571,7 +618,8 @@ ; CHECK-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_rsq_f32_e32 v0, v0 +; CHECK-NEXT: v_sqrt_f32_e32 v0, v0 +; CHECK-NEXT: v_rcp_f32_e32 v0, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %sqrt = call float @llvm.sqrt.f32(float %x) %fdiv = fdiv afn float 1.0, %sqrt @@ -579,11 +627,57 @@ } define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) { -; CHECK-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_rsq_f32_e32 v0, v0 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; CODEGEN-DAZ-SDAG: ; %bb.0: +; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; CODEGEN-DAZ-GISEL: ; %bb.0: +; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; IR-DAZ-SDAG: ; %bb.0: +; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; IR-DAZ-GISEL: ; %bb.0: +; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call float @llvm.sqrt.f32(float %x) %fdiv = fdiv arcp afn float 1.0, %sqrt ret float %fdiv @@ -823,7 +917,8 @@ ; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25: ; CODEGEN-DAZ: ; %bb.0: ; CODEGEN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-NEXT: v_rcp_f32_e32 v0, v0 ; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31] ; ; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25: @@ -928,7 +1023,8 @@ ; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_rsq_f32_e32 v0, v0 +; CHECK-NEXT: v_sqrt_f32_e32 v0, v0 +; CHECK-NEXT: v_rcp_f32_e32 v0, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %sqrt = call afn float @llvm.sqrt.f32(float %x), !fpmath !0 %fdiv = fdiv afn float 1.0, %sqrt, !fpmath !0 @@ -936,11 +1032,57 @@ } define float @v_recip_sqrt_f32_afn_ulp25_contract(float %x) { -; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_rsq_f32_e32 v0, v0 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; CODEGEN-DAZ-SDAG: ; %bb.0: +; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; CODEGEN-DAZ-GISEL: ; %bb.0: +; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-GISEL-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; IR-DAZ-SDAG: ; %bb.0: +; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; IR-DAZ-GISEL: ; %bb.0: +; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn float @llvm.sqrt.f32(float %x), !fpmath !0 %fdiv = fdiv contract afn float 1.0, %sqrt, !fpmath !0 ret float %fdiv Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -65,7 +65,8 @@ } ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32: -; SI: v_rsq_f32_e32 +; SI: v_sqrt_f32_e32 +; SI: v_rcp_f32_e32 define amdgpu_kernel void @safe_rsq_rcp_pat_f32(ptr addrspace(1) %out, float %src) #1 { %sqrt = call contract float @llvm.sqrt.f32(float %src) %rcp = call contract float @llvm.amdgcn.rcp.f32(float %sqrt) @@ -94,7 +95,8 @@ } ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32: -; SI: v_rsq_f32_e32 +; SI: v_sqrt_f32_e32 +; SI: v_rcp_f32_e32 define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(ptr addrspace(1) %out, float %src) #2 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) Index: llvm/test/CodeGen/AMDGPU/rsq.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -18,23 +18,23 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { -; GCN-DAZ-LABEL: rsq_f32: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-NEXT: s_endpgm +; GCN-DAZ-UNSAFE-LABEL: rsq_f32: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: @@ -54,6 +54,25 @@ ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; +; GCN-DAZ-SAFE-LABEL: rsq_f32: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 @@ -127,16 +146,16 @@ } define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) { -; GCN-DAZ-LABEL: rsq_f32_sgpr: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-DAZ-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, s2 -; GCN-DAZ-NEXT: s_mov_b32 s2, -1 -; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-DAZ-NEXT: s_endpgm +; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 +; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-IEEE-UNSAFE: ; %bb.0: @@ -149,6 +168,18 @@ ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; +; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, s2 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-DAZ-SAFE-NEXT: s_endpgm +; ; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb @@ -1168,13 +1199,13 @@ } define { float, float } @v_rsq_f32_multi_use(float %val) { -; GCN-DAZ-LABEL: v_rsq_f32_multi_use: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_sqrt_f32_e32 v2, v0 -; GCN-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-NEXT: v_mov_b32_e32 v0, v2 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_multi_use: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v0 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v0 +; GCN-DAZ-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_multi_use: ; GCN-IEEE-UNSAFE: ; %bb.0: @@ -1184,6 +1215,13 @@ ; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; +; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v1, v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; ; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1216,11 +1254,11 @@ } define float @v_rsq_f32_missing_contract0(float %val) { -; GCN-DAZ-LABEL: v_rsq_f32_missing_contract0: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract0: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract0: ; GCN-IEEE-UNSAFE: ; %bb.0: @@ -1228,6 +1266,13 @@ ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; +; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1258,11 +1303,11 @@ } define float @v_rsq_f32_missing_contract1(float %val) { -; GCN-DAZ-LABEL: v_rsq_f32_missing_contract1: -; GCN-DAZ: ; %bb.0: -; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-NEXT: s_setpc_b64 s[30:31] +; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract1: +; GCN-DAZ-UNSAFE: ; %bb.0: +; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 +; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract1: ; GCN-IEEE-UNSAFE: ; %bb.0: @@ -1270,6 +1315,13 @@ ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; +; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1: +; GCN-DAZ-SAFE: ; %bb.0: +; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31] +; ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1: ; SI-IEEE-SAFE: ; %bb.0: ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)