Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -26,6 +26,13 @@
     [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
   (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;
 
+def rcp_sqrt_to_rsq : GICombineRule<
+  (defs root:$rcp, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_INTRINSIC):$rcp,
+         [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
+
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;
 
 def cvt_f32_ubyteN : GICombineRule<
@@ -76,7 +83,7 @@
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines,
-  uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {
+  uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, rcp_sqrt_to_rsq]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];
Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Target/TargetMachine.h"
@@ -56,6 +57,9 @@
   bool matchUCharToFloat(MachineInstr &MI);
   void applyUCharToFloat(MachineInstr &MI);
 
+  bool matchRcpSqrtToRsq(MachineInstr &MI,
+                         std::function<void(MachineIRBuilder &)> &MatchInfo);
+
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
   struct CvtF32UByteMatchInfo {
@@ -201,6 +205,22 @@
   MI.eraseFromParent();
 }
 
+bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
+      MI.getIntrinsicID() == Intrinsic::amdgcn_rcp) {
+    MachineInstr *SqrtMI = MRI.getVRegDef(MI.getOperand(2).getReg());
+    if (SqrtMI->getOpcode() == TargetOpcode::G_FSQRT) {
+      MatchInfo = [=, &MI](MachineIRBuilder &B) {
+        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+            .addUse(SqrtMI->getOperand(1).getReg());
+      };
+      return true;
+    }
+  }
+  return false;
+}
+
 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
   Register SrcReg = MI.getOperand(1).getReg();
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -827,10 +827,6 @@
 
 let OtherPredicates = [UnsafeFPMath] in {
 
-//defm : RsqPat<V_RSQ_F64_e64, f64>;
-
-def : RsqPat<V_RSQ_F32_e32, f32>;
-
 // Convert (x - floor(x)) to fract(x)
 def : GCNPat <
   (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
@@ -0,0 +1,24 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: test
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK: $vgpr0 = COPY %3
+    ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ; GCN-LABEL: name: test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = G_FSQRT %0:_
+    %3:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...