Index: llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -489,6 +489,11 @@
   return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));
 }
 
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT> m_GFSqrt(const SrcTy &Src) {
+  return UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>(Src);
+}
+
 // General helper for generic MI compares, i.e. G_ICMP and G_FCMP
 // TODO: Allow checking a specific predicate.
 template <typename Pred_P, typename LHS_P, typename RHS_P, unsigned Opcode>
Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -26,6 +26,14 @@
          [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
   (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;
 
+
+def rcp_sqrt_to_rsq : GICombineRule<
+  (defs root:$rcp, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_INTRINSIC, G_FSQRT):$rcp,
+         [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
+
+
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;
 
 def cvt_f32_ubyteN : GICombineRule<
@@ -86,7 +94,8 @@
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines,
-  uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg]> {
+  uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
+  rcp_sqrt_to_rsq]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -713,11 +713,6 @@
   (RcpInst $src)
 >;
 
-class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
-  (AMDGPUrcp (fsqrt vt:$src)),
-  (RsqInst $src)
->;
-
 // Instructions which select to the same v_min_f*
 def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
   [(fminnum_ieee node:$src0, node:$src1),
Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Target/TargetMachine.h"
 
 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
@@ -58,6 +59,9 @@
   bool matchUCharToFloat(MachineInstr &MI);
   void applyUCharToFloat(MachineInstr &MI);
 
+  bool matchRcpSqrtToRsq(MachineInstr &MI,
+                         std::function<void(MachineIRBuilder &)> &MatchInfo);
+
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
   struct CvtF32UByteMatchInfo {
@@ -203,6 +207,48 @@
   MI.eraseFromParent();
 }
 
+bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+
+  auto getRcpSrc = [=](const MachineInstr &MI) {
+    MachineInstr *ResMI = nullptr;
+    if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
+        MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
+      ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());
+
+    return ResMI;
+  };
+
+  auto getSqrtSrc = [=](const MachineInstr &MI) {
+    MachineInstr *SqrtSrcMI = nullptr;
+    mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
+    return SqrtSrcMI;
+  };
+
+  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
+  // rcp(sqrt(x))
+  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
+    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
+      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+          .addUse(SqrtSrcMI->getOperand(0).getReg())
+          .setMIFlags(MI.getFlags());
+    };
+    return true;
+  }
+
+  // sqrt(rcp(x))
+  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
+    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
+      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+          .addUse(RcpSrcMI->getOperand(0).getReg())
+          .setMIFlags(MI.getFlags());
+    };
+    return true;
+  }
+
+  return false;
+}
+
 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
   Register SrcReg = MI.getOperand(1).getReg();
Index: llvm/lib/Target/AMDGPU/CaymanInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -48,8 +48,6 @@
 def COS_cm : COS_Common<0x8E>;
 } // End isVector = 1
 
-def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
-
 def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
 
 def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
Index: llvm/lib/Target/AMDGPU/EvergreenInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -126,7 +126,6 @@
 
 def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
 def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
 def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
 def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
 def SIN_eg : SIN_Common<0x8D>;
Index: llvm/lib/Target/AMDGPU/R600Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/R600Instructions.td
+++ llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1265,7 +1265,6 @@
 
   defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
   def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
-  def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
   def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
 
 def R600_ExportSwz : ExportSwzInst {
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -827,10 +827,6 @@
 
 let OtherPredicates = [UnsafeFPMath] in {
 
-//defm : RsqPat<fsqrt, v_rsq_f32>;
-
-def : RsqPat<V_RSQ_F32_e32, f32>;
-
 // Convert (x - floor(x)) to fract(x)
 def : GCNPat <
   (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+define amdgpu_cs float @div_sqrt(float inreg %arg1) {
+; GCN-LABEL: div_sqrt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call float @llvm.sqrt.f32(float %arg1)
+  %b = fdiv afn float 1.000000e+00, %a
+  ret float %b
+}
+
+define amdgpu_cs float @sqrt_div(float inreg %arg1) {
+; GCN-LABEL: sqrt_div:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = fdiv afn float 1.000000e+00, %arg1
+  %b = call float @llvm.sqrt.f32(float %a)
+  ret float %b
+}
+
+define amdgpu_cs float @rcp_sqrt(float inreg %arg1) {
+; GCN-LABEL: rcp_sqrt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call float @llvm.sqrt.f32(float %arg1)
+  %b = call float @llvm.amdgcn.rcp.f32(float %a)
+  ret float %b
+}
+
+define amdgpu_cs float @sqrt_rcp(float inreg %arg1) {
+; GCN-LABEL: sqrt_rcp:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call float @llvm.amdgcn.rcp.f32(float %arg1)
+  %b = call float @llvm.sqrt.f32(float %a)
+  ret float %b
+}
+
+
+declare float @llvm.sqrt.f32(float)
+declare float @llvm.amdgcn.rcp.f32(float)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: rcp_sqrt_test
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK: $vgpr0 = COPY %3
+    ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ; GCN-LABEL: name: rcp_sqrt_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = G_FSQRT %0:_
+    %3:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name: sqrt_rcp_test
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: sqrt_rcp_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0:_(s32)
+    %3:_(s32) = G_FSQRT %2:_
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...