Index: llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -316,6 +316,12 @@
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_MUL, true>(L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FDIV>
+m_GFDiv(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_FDIV>(L, R);
+}
+
 template <typename LHS, typename RHS>
 inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FADD, true>
 m_GFAdd(const LHS &L, const RHS &R) {
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_FADD, true>(L, R);
 }
@@ -464,6 +470,11 @@
   return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));
 }
 
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT> m_GFSqrt(const SrcTy &Src) {
+  return UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>(Src);
+}
+
 // General helper for generic MI compares, i.e. G_ICMP and G_FCMP
 // TODO: Allow checking a specific predicate.
 template <typename Pred_P, typename LHS_P, typename RHS_P, unsigned Opcode>
Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -26,6 +26,14 @@
     [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
   (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;
 
+def rcp_sqrt_to_rsq : GICombineRule<
+  (defs root:$rcp, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_INTRINSIC, G_FDIV, G_FSQRT):$rcp,
+         [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
+
+
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;
 
 def cvt_f32_ubyteN : GICombineRule<
@@ -76,7 +84,8 @@
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines,
-  uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {
+  uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize,
+  rcp_sqrt_to_rsq]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -656,11 +656,6 @@
   (RcpInst $src)
 >;
 
-class RsqPat <Instruction RsqInst, ValueType vt> : AMDGPUPat <
-  (AMDGPUrcp (fsqrt vt:$src)),
-  (RsqInst $src)
->;
-
 // Instructions which select to the same v_min_f*
 def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
   [(fminnum_ieee node:$src0, node:$src1),
Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Target/TargetMachine.h"
@@ -56,6 +57,11 @@
   bool matchUCharToFloat(MachineInstr &MI);
   void applyUCharToFloat(MachineInstr &MI);
 
+  bool matchRcpSqrtToRsq(MachineInstr &MI,
+                         std::function<void(MachineIRBuilder &)> &MatchInfo);
+  bool matchSqrtRcpToRsq(MachineInstr &MI,
+                         std::function<void(MachineIRBuilder &)> &MatchInfo);
+
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
   struct CvtF32UByteMatchInfo {
@@ -201,6 +207,51 @@
   MI.eraseFromParent();
 }
 
+bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+
+  auto getRcpSrc = [=](const MachineInstr &MI) {
+    if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
+        MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
+      return MRI.getVRegDef(MI.getOperand(2).getReg());
+
+    MachineInstr *DivSrcMI = nullptr;
+    mi_match(MI.getOperand(0).getReg(), MRI,
+             m_GFDiv(m_SpecificICst(1), m_MInstr(DivSrcMI)));
+    return DivSrcMI;
+  };
+
+  auto getSqrtSrc = [=](const MachineInstr &MI) {
+    MachineInstr *SqrtSrcMI = nullptr;
+    mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
+    return SqrtSrcMI;
+  };
+
+  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
+  if ((RcpSrcMI = getRcpSrc(MI)) &&
+      (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
+    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
+      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+          .addUse(SqrtSrcMI->getOperand(0).getReg())
+          .setMIFlags(MI.getFlags());
+    };
+    return true;
+  }
+
+  if ((SqrtSrcMI = getSqrtSrc(MI)) &&
+      (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
+    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
+      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+          .addUse(RcpSrcMI->getOperand(0).getReg())
+          .setMIFlags(MI.getFlags());
+    };
+    return true;
+  }
+
+  return false;
+}
+
+
 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
   Register SrcReg = MI.getOperand(1).getReg();
Index: llvm/lib/Target/AMDGPU/CaymanInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -48,8 +48,6 @@
 def COS_cm : COS_Common<0x8E>;
 } // End isVector = 1
 
-def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
-
 def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
 
 def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
Index: llvm/lib/Target/AMDGPU/EvergreenInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -126,7 +126,6 @@
 def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
 def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
 def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
 def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
 def SIN_eg : SIN_Common<0x8D>;
 
Index: llvm/lib/Target/AMDGPU/R600Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/R600Instructions.td
+++ llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1275,7 +1275,6 @@
   defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
   def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
 
-  def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
   def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
 
 def R600_ExportSwz : ExportSwzInst {
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -827,10 +827,6 @@
 
 let OtherPredicates = [UnsafeFPMath] in {
 
-//defm : RsqPat<fdiv, f32>;
-
-def : RsqPat<V_RSQ_F32_e32, f32>;
-
 // Convert (x - floor(x)) to fract(x)
 def : GCNPat <
   (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: rcp_sqrt_test
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK: $vgpr0 = COPY %3
+    ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ; GCN-LABEL: name: rcp_sqrt_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = G_FSQRT %0:_
+    %3:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+
+---
+name: div_sqrt_test
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK: $vgpr0 = COPY %3
+    ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ; GCN-LABEL: name: div_sqrt_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = G_FSQRT %0:_
+    %1:_(s32) = G_CONSTANT i32 1
+    %3:_(s32) = afn G_FDIV %1, %2:_(s32)
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name: sqrt_rcp_test
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: sqrt_rcp_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0:_(s32)
+    %3:_(s32) = G_FSQRT %2:_
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+
+---
+name: sqrt_div_test
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: sqrt_div_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(s32) = afn G_FDIV %1, %0:_(s32)
+    %3:_(s32) = G_FSQRT %2:_
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...