diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -136,6 +136,10 @@
   void replaceRegOpWith(MachineRegisterInfo &MRI, MachineOperand &FromRegOp,
                         Register ToReg) const;
 
+  /// Replace the opcode of \p FromMI with \p ToOpcode and inform the observer
+  /// of the change.
+  void replaceOpcodeWith(MachineInstr &FromMI, unsigned ToOpcode) const;
+
   /// Get the register bank of \p Reg.
   /// If Reg has not been assigned a register, a register class,
   /// or a register bank, then this returns nullptr.
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -158,6 +158,15 @@
   Observer.changedInstr(*FromRegOp.getParent());
 }
 
+void CombinerHelper::replaceOpcodeWith(MachineInstr &FromMI,
+                                       unsigned ToOpcode) const {
+  Observer.changingInstr(FromMI);
+
+  FromMI.setDesc(Builder.getTII().get(ToOpcode));
+
+  Observer.changedInstr(FromMI);
+}
+
 const RegisterBank *CombinerHelper::getRegBank(Register Reg) const {
   return RBI->getRegBank(Reg, MRI, *TRI);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -64,19 +64,29 @@
          [{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]),
   (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>;
 
+def foldable_fneg_matchdata : GIDefMatchData<"MachineInstr *">;
+
+def foldable_fneg : GICombineRule<
+  (defs root:$ffn, foldable_fneg_matchdata:$matchinfo),
+  (match (wip_match_opcode G_FNEG):$ffn,
+         [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]),
+  (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
 def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
-  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
+  "AMDGPUGenPreLegalizerCombinerHelper",
+  [all_combines, clamp_i64_to_i16, foldable_fneg]> {
   let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
+  let AdditionalArguments = [];
 }
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines,
-  uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {
+  uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
@@ -0,0 +1,31 @@
+//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.h ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This contains AMDGPU-specific combine transformations used by the AMDGPU
+/// combiner passes.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H
+
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+
+using namespace llvm;
+
+class AMDGPUCombinerHelper : public CombinerHelper {
+public:
+  using CombinerHelper::CombinerHelper;
+
+  bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
+  void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
+};
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -0,0 +1,396 @@
+//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCombinerHelper.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+LLVM_READNONE
+static bool fnegFoldsIntoMI(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::G_FADD:
+  case AMDGPU::G_FSUB:
+  case AMDGPU::G_FMUL:
+  case AMDGPU::G_FMA:
+  case AMDGPU::G_FMAD:
+  case AMDGPU::G_FMINNUM:
+  case AMDGPU::G_FMAXNUM:
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE:
+  case AMDGPU::G_FSIN:
+  case AMDGPU::G_FPEXT:
+  case AMDGPU::G_INTRINSIC_TRUNC:
+  case AMDGPU::G_FPTRUNC:
+  case AMDGPU::G_FRINT:
+  case AMDGPU::G_FNEARBYINT:
+  case AMDGPU::G_INTRINSIC_ROUND:
+  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+  case AMDGPU::G_FCANONICALIZE:
+  case AMDGPU::G_AMDGPU_RCP_IFLAG:
+  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+    return true;
+  case AMDGPU::G_INTRINSIC: {
+    unsigned IntrinsicID = MI.getIntrinsicID();
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_rcp:
+    case Intrinsic::amdgcn_rcp_legacy:
+    case Intrinsic::amdgcn_sin:
+    case Intrinsic::amdgcn_fmul_legacy:
+    case Intrinsic::amdgcn_fmed3:
+    case Intrinsic::amdgcn_fma_legacy:
+      return true;
+    default:
+      return false;
+    }
+  }
+  default:
+    return false;
+  }
+}
+
+/// \returns true if the operation will definitely need to use a 64-bit
+/// encoding, and thus will use a VOP3 encoding regardless of the source
+/// modifiers.
+LLVM_READONLY
+static bool opMustUseVOP3Encoding(const MachineInstr &MI,
+                                  const MachineRegisterInfo &MRI) {
+  return MI.getNumOperands() >
+             (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4 : 3) ||
+         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
+}
+
+// Most FP instructions support source modifiers.
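+// The opcodes rejected below are treated as unable to take a source modifier:
+// they either consume their inputs as raw bits (bitcast, build_vector,
+// anyext), move data without computing (copy, select, phi), expand to
+// multi-instruction sequences (fdiv, frem, inline asm), or are intrinsics
+// whose inputs bypass the usual VOP source-modifier path.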
+LLVM_READONLY
+static bool hasSourceMods(const MachineInstr &MI) {
+  if (!MI.memoperands().empty())
+    return false;
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::G_SELECT:
+  case AMDGPU::G_FDIV:
+  case AMDGPU::G_FREM:
+  case TargetOpcode::INLINEASM:
+  case TargetOpcode::INLINEASM_BR:
+  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+  case AMDGPU::G_BITCAST:
+  case AMDGPU::G_ANYEXT:
+  case AMDGPU::G_BUILD_VECTOR:
+  case AMDGPU::G_BUILD_VECTOR_TRUNC:
+  case AMDGPU::G_PHI:
+    return false;
+  case AMDGPU::G_INTRINSIC: {
+    unsigned IntrinsicID = MI.getIntrinsicID();
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_interp_p1:
+    case Intrinsic::amdgcn_interp_p2:
+    case Intrinsic::amdgcn_interp_mov:
+    case Intrinsic::amdgcn_interp_p1_f16:
+    case Intrinsic::amdgcn_interp_p2_f16:
+    case Intrinsic::amdgcn_div_scale:
+      return false;
+    default:
+      return true;
+    }
+  }
+  default:
+    return true;
+  }
+}
+
+static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                  unsigned CostThreshold = 4) {
+  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and for
+  // them a source modifier is truly free. For users that could otherwise use
+  // a smaller encoding, a source modifier forces the VOP3 encoding and code
+  // size increases. Try to avoid increasing code size unless we know it will
+  // save on the instruction count.
+  unsigned NumMayIncreaseSize = 0;
+  Register Dst = MI.getOperand(0).getReg();
+  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
+    if (!hasSourceMods(Use))
+      return false;
+
+    if (!opMustUseVOP3Encoding(Use, MRI)) {
+      if (++NumMayIncreaseSize > CostThreshold)
+        return false;
+    }
+  }
+  return true;
+}
+
+static bool mayIgnoreSignedZero(MachineInstr &MI) {
+  const TargetOptions &Options = MI.getMF()->getTarget().Options;
+  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
+}
+
+static bool isInv2Pi(const APFloat &APF) {
+  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
+  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
+  static const APFloat KF64(APFloat::IEEEdouble(),
+                            APInt(64, 0x3fc45f306dc9c882));
+
+  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
+         APF.bitwiseIsEqual(KF64);
+}
+
+// The negated forms of 0 and 1.0 / (2.0 * pi) do not have inline immediates,
+// so there is an additional cost to negate them.
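+// For example, folding an fneg into fminnum(x, 0.0) would turn the free
+// inline-immediate operand 0.0 into -0.0, which is not an inline immediate
+// and would have to be materialized as a 32-bit literal.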
+static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
+                                       MachineRegisterInfo &MRI) {
+  Optional<FPValueAndVReg> FPValReg;
+  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
+    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
+      return true;
+
+    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
+    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
+      return true;
+  }
+  return false;
+}
+
+static unsigned inverseMinMax(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::G_FMAXNUM:
+    return AMDGPU::G_FMINNUM;
+  case AMDGPU::G_FMINNUM:
+    return AMDGPU::G_FMAXNUM;
+  case AMDGPU::G_FMAXNUM_IEEE:
+    return AMDGPU::G_FMINNUM_IEEE;
+  case AMDGPU::G_FMINNUM_IEEE:
+    return AMDGPU::G_FMAXNUM_IEEE;
+  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
+  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
+  default:
+    llvm_unreachable("invalid min/max opcode");
+  }
+}
+
+bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
+                                             MachineInstr *&MatchInfo) {
+  Register Src = MI.getOperand(1).getReg();
+  MatchInfo = MRI.getVRegDef(Src);
+
+  // If the input has multiple uses and we can either fold the negate down, or
+  // the other uses cannot, give up. This both prevents unprofitable
+  // transformations and infinite loops: we won't repeatedly try to fold around
+  // a negate that has no 'good' form.
+  if (MRI.hasOneNonDBGUse(Src)) {
+    if (allUsesHaveSourceMods(MI, MRI, 0))
+      return false;
+  } else {
+    if (fnegFoldsIntoMI(*MatchInfo) &&
+        (allUsesHaveSourceMods(MI, MRI) ||
+         !allUsesHaveSourceMods(*MatchInfo, MRI)))
+      return false;
+  }
+
+  switch (MatchInfo->getOpcode()) {
+  case AMDGPU::G_FMINNUM:
+  case AMDGPU::G_FMAXNUM:
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE:
+  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+    // 0 doesn't have a negated inline immediate.
+    return !isConstantCostlierToNegate(*MatchInfo,
+                                       MatchInfo->getOperand(2).getReg(), MRI);
+  case AMDGPU::G_FADD:
+  case AMDGPU::G_FSUB:
+  case AMDGPU::G_FMA:
+  case AMDGPU::G_FMAD:
+    return mayIgnoreSignedZero(*MatchInfo);
+  case AMDGPU::G_FMUL:
+  case AMDGPU::G_FPEXT:
+  case AMDGPU::G_INTRINSIC_TRUNC:
+  case AMDGPU::G_FPTRUNC:
+  case AMDGPU::G_FRINT:
+  case AMDGPU::G_FNEARBYINT:
+  case AMDGPU::G_INTRINSIC_ROUND:
+  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+  case AMDGPU::G_FSIN:
+  case AMDGPU::G_FCANONICALIZE:
+  case AMDGPU::G_AMDGPU_RCP_IFLAG:
+    return true;
+  case AMDGPU::G_INTRINSIC: {
+    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_rcp:
+    case Intrinsic::amdgcn_rcp_legacy:
+    case Intrinsic::amdgcn_sin:
+    case Intrinsic::amdgcn_fmul_legacy:
+    case Intrinsic::amdgcn_fmed3:
+      return true;
+    case Intrinsic::amdgcn_fma_legacy:
+      return mayIgnoreSignedZero(*MatchInfo);
+    default:
+      return false;
+    }
+  }
+  default:
+    return false;
+  }
+}
+
+void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
+                                             MachineInstr *&MatchInfo) {
+  // Transform:
+  // %A = inst %Op1, ...
+  // %B = fneg %A
+  //
+  // into:
+  //
+  // (if %A has one use, specifically fneg above)
+  // %B = inst (maybe fneg %Op1), ...
+  //
+  // (if %A has multiple uses)
+  // %B = inst (maybe fneg %Op1), ...
+  // %A = fneg %B
+
+  // Replace a register operand with a register holding the negated value.
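+  // If the operand is already a negated value, strip the existing G_FNEG;
+  // otherwise insert a new G_FNEG in front of MatchInfo.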
+  auto NegateOperand = [&](MachineOperand &Op) {
+    Register Reg = Op.getReg();
+    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
+      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
+    replaceRegOpWith(MRI, Op, Reg);
+  };
+
+  // Negate one of the two operands, reusing an existing G_FNEG if possible.
+  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
+    Register XReg = X.getReg();
+    Register YReg = Y.getReg();
+    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
+      replaceRegOpWith(MRI, X, XReg);
+    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
+      replaceRegOpWith(MRI, Y, YReg);
+    else {
+      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
+      replaceRegOpWith(MRI, Y, YReg);
+    }
+  };
+
+  Builder.setInstrAndDebugLoc(*MatchInfo);
+
+  // Negate the appropriate operands so that the resulting value of MatchInfo
+  // is negated.
+  switch (MatchInfo->getOpcode()) {
+  case AMDGPU::G_FADD:
+  case AMDGPU::G_FSUB:
+    NegateOperand(MatchInfo->getOperand(1));
+    NegateOperand(MatchInfo->getOperand(2));
+    break;
+  case AMDGPU::G_FMUL:
+    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
+    break;
+  case AMDGPU::G_FMINNUM:
+  case AMDGPU::G_FMAXNUM:
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE:
+  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
+    NegateOperand(MatchInfo->getOperand(1));
+    NegateOperand(MatchInfo->getOperand(2));
+    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
+    replaceOpcodeWith(*MatchInfo, Opposite);
+    break;
+  }
+  case AMDGPU::G_FMA:
+  case AMDGPU::G_FMAD:
+    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
+    NegateOperand(MatchInfo->getOperand(3));
+    break;
+  case AMDGPU::G_FPEXT:
+  case AMDGPU::G_INTRINSIC_TRUNC:
+  case AMDGPU::G_FRINT:
+  case AMDGPU::G_FNEARBYINT:
+  case AMDGPU::G_INTRINSIC_ROUND:
+  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+  case AMDGPU::G_FSIN:
+  case AMDGPU::G_FCANONICALIZE:
+  case AMDGPU::G_AMDGPU_RCP_IFLAG:
+  case AMDGPU::G_FPTRUNC:
+    NegateOperand(MatchInfo->getOperand(1));
+    break;
+  case AMDGPU::G_INTRINSIC: {
+    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_rcp:
+    case Intrinsic::amdgcn_rcp_legacy:
+    case Intrinsic::amdgcn_sin:
+      NegateOperand(MatchInfo->getOperand(2));
+      break;
+    case Intrinsic::amdgcn_fmul_legacy:
+      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
+      break;
+    case Intrinsic::amdgcn_fmed3:
+      NegateOperand(MatchInfo->getOperand(2));
+      NegateOperand(MatchInfo->getOperand(3));
+      NegateOperand(MatchInfo->getOperand(4));
+      break;
+    case Intrinsic::amdgcn_fma_legacy:
+      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
+      NegateOperand(MatchInfo->getOperand(4));
+      break;
+    default:
+      llvm_unreachable("folding fneg not supported for this intrinsic");
+    }
+    break;
+  }
+  default:
+    llvm_unreachable("folding fneg not supported for this instruction");
+  }
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();
+
+  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
+    // MatchInfo now produces the negated value, so use it instead of old Dst.
+    replaceRegWith(MRI, Dst, MatchInfoDst);
+  } else {
+    // We want to swap all uses of Dst with uses of MatchInfoDst and vice
+    // versa, but replaceRegWith would replace defs as well. It is easier to
+    // replace one def with a new register.
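+    // Concretely, for
+    //   %A = inst ...   (with users besides the fneg)
+    //   %B = fneg %A
+    // rewrite MatchInfo to define a fresh register %N holding the negated
+    // value, replace all uses of %B with %N, and recreate %A as fneg %N for
+    // the remaining users.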
+    LLT Type = MRI.getType(Dst);
+    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
+    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);
+
+    // MatchInfo now produces the negated value, so use it instead of old Dst.
+    replaceRegWith(MRI, Dst, NegatedMatchInfo);
+
+    // Recreate the non-negated value for the other uses of old MatchInfoDst.
+    Builder.setInstrAndDebugLoc(MI);
+    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
+  }
+
+  MI.eraseFromParent();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUCombinerHelper.h"
 #include "AMDGPULegalizerInfo.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -34,10 +35,11 @@
   MachineIRBuilder &B;
   MachineFunction &MF;
   MachineRegisterInfo &MRI;
-  CombinerHelper &Helper;
+  AMDGPUCombinerHelper &Helper;
 
 public:
-  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
+                                    AMDGPUCombinerHelper &Helper)
       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
 
   struct FMinFMaxLegacyInfo {
@@ -257,12 +259,12 @@
 class AMDGPUPostLegalizerCombinerHelperState {
 protected:
-  CombinerHelper &Helper;
+  AMDGPUCombinerHelper &Helper;
   AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
 
 public:
   AMDGPUPostLegalizerCombinerHelperState(
-      CombinerHelper &Helper,
+      AMDGPUCombinerHelper &Helper,
       AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
       : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
 };
@@ -300,7 +302,7 @@
 bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                               MachineInstr &MI,
                                               MachineIRBuilder &B) const {
-  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
+  AMDGPUCombinerHelper Helper(Observer, B, KB, MDT, LInfo);
   AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
   AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                  PostLegalizerHelper);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUCombinerHelper.h"
 #include "AMDGPULegalizerInfo.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -34,10 +35,11 @@
   MachineIRBuilder &B;
   MachineFunction &MF;
   MachineRegisterInfo &MRI;
-  CombinerHelper &Helper;
+  AMDGPUCombinerHelper &Helper;
 
 public:
-  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B,
+                                   AMDGPUCombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
 
   struct ClampI64ToI16MatchInfo {
@@ -154,12 +156,12 @@
 class AMDGPUPreLegalizerCombinerHelperState {
 protected:
-  CombinerHelper &Helper;
+  AMDGPUCombinerHelper &Helper;
   AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
 
 public:
   AMDGPUPreLegalizerCombinerHelperState(
-      CombinerHelper &Helper,
+      AMDGPUCombinerHelper &Helper,
       AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
       : Helper(Helper),
         PreLegalizerHelper(PreLegalizerHelper) {}
 };
@@ -196,12 +198,12 @@
 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
-  CombinerHelper Helper(Observer, B, KB, MDT);
+  AMDGPUCombinerHelper Helper(Observer, B, KB, MDT);
   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
   AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PreLegalizerHelper);
 
-  if (Generated.tryCombineAll(Observer, MI, B, Helper))
+  if (Generated.tryCombineAll(Observer, MI, B))
     return true;
 
   switch (MI.getOpcode()) {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -51,6 +51,7 @@
   AMDGPUAttributor.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-foldable-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-foldable-fneg.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-foldable-fneg.mir
@@ -0,0 +1,785 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: test_fminnum
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fminnum
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:_(s32) = G_FMAXNUM [[FNEG]], [[FNEG1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FMINNUM %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fmaxnum
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fmaxnum
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:_(s32) = G_FMINNUM [[FNEG]], [[FNEG1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FMAXNUM %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fminnum_ieee
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fminnum_ieee
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FNEG]], [[FNEG1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FMINNUM_IEEE %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fmaxnum_ieee
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fmaxnum_ieee
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FNEG]], [[FNEG1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FMAXNUM_IEEE %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_amdgpu_fmin_legacy
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_amdgpu_fmin_legacy
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_FMAX_LEGACY:%[0-9]+]]:_(s32) = G_AMDGPU_FMAX_LEGACY [[FNEG]], [[FNEG1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMAX_LEGACY]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_AMDGPU_FMIN_LEGACY %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_amdgpu_fmax_legacy
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_amdgpu_fmax_legacy
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[AMDGPU_FMIN_LEGACY:%[0-9]+]]:_(s32) = G_AMDGPU_FMIN_LEGACY [[FNEG]], [[FNEG1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMIN_LEGACY]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_AMDGPU_FMAX_LEGACY %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fadd
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fadd
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nsz G_FSUB [[FNEG]], [[COPY1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FSUB]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = nsz G_FADD %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fsub
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fsub
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nsz G_FSUB [[COPY1]], [[COPY]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FSUB]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = nsz G_FSUB %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fma
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fma
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY2]]
+    ; CHECK-NEXT: [[FMA:%[0-9]+]]:_(s32) = nsz G_FMA [[COPY]], [[FNEG]], [[FNEG1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = nsz G_FMA %0, %1, %2
+    %4:_(s32) = G_FNEG %3
+    $vgpr0 = COPY %4(s32)
+
+...
+---
+name: test_fmad
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fmad
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY2]]
+    ; CHECK-NEXT: [[FMAD:%[0-9]+]]:_(s32) = nsz G_FMAD [[COPY]], [[FNEG]], [[FNEG1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMAD]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = nsz G_FMAD %0, %1, %2
+    %4:_(s32) = G_FNEG %3
+    $vgpr0 = COPY %4(s32)
+
+...
+---
+name: test_fmul
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fmul
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FMUL %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fpext
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fpext
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
+    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[FNEG]](s16)
+    ; CHECK-NEXT: $vgpr0 = COPY [[FPEXT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s16) = G_TRUNC %0(s32)
+    %2:_(s32) = G_FPEXT %1(s16)
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_intrinsic_trunc
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_intrinsic_trunc
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_TRUNC]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_INTRINSIC_TRUNC %0
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_frint
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_frint
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FRINT:%[0-9]+]]:_(s32) = G_FRINT [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FRINT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FRINT %0
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_fnearbyint
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fnearbyint
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FNEARBYINT:%[0-9]+]]:_(s32) = G_FNEARBYINT [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FNEARBYINT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FNEARBYINT %0
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_intrinsic_round
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_intrinsic_round
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[INTRINSIC_ROUND:%[0-9]+]]:_(s32) = G_INTRINSIC_ROUND [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_ROUND]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_INTRINSIC_ROUND %0
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_intrinsic_roundeven
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_intrinsic_roundeven
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[INTRINSIC_ROUNDEVEN:%[0-9]+]]:_(s32) = G_INTRINSIC_ROUNDEVEN [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_ROUNDEVEN]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_INTRINSIC_ROUNDEVEN %0
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_fsin
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fsin
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FSIN:%[0-9]+]]:_(s32) = G_FSIN [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FSIN]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FSIN %0
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_fcanonicalize
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fcanonicalize
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FCANONICALIZE]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FCANONICALIZE %0
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_amdgcn_rcp_iflag
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_amdgcn_rcp_iflag
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[FNEG]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_RCP_IFLAG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_AMDGPU_RCP_IFLAG %0
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_fptrunc
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: test_fptrunc
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s32) = G_FPTRUNC [[FNEG]](s64)
+    ; CHECK-NEXT: $vgpr0 = COPY [[FPTRUNC]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_FPTRUNC %0:_(s64)
+    %2:_(s32) = G_FNEG %1:_
+    $vgpr0 = COPY %2:_(s32)
+
+...
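+
+# For G_INTRINSIC the value operands start at operand 2, since operand 1
+# holds the intrinsic ID; the tests below cover the foldable AMDGPU
+# intrinsics.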
+---
+name: test_amdgcn_rcp
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_amdgcn_rcp
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0(s32)
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_amdgcn_rcp_legacy
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_amdgcn_rcp_legacy
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp.legacy), [[FNEG]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp.legacy), %0(s32)
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_amdgcn_sin
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_amdgcn_sin
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), [[FNEG]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), %0(s32)
+    %2:_(s32) = G_FNEG %1
+    $vgpr0 = COPY %2(s32)
+
+...
+---
+name: test_fmul_legacy
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fmul_legacy
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[COPY]](s32), [[FNEG]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), %0(s32), %1(s32)
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fmed3
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fmed3
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[COPY2]]
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), [[FNEG]](s32), [[FNEG1]](s32), [[FNEG2]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %0(s32), %1(s32), %2(s32)
+    %4:_(s32) = G_FNEG %3
+    $vgpr0 = COPY %4(s32)
+
+...
+---
+name: test_amdgcn_fma_legacy
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_amdgcn_fma_legacy
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[COPY2]]
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fma.legacy), [[COPY]](s32), [[FNEG]](s32), [[FNEG1]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fma.legacy), %0(s32), %1(s32), %2(s32)
+    %4:_(s32) = G_FNEG %3
+    $vgpr0 = COPY %4(s32)
+
+...
+
+# Don't fold fneg for fadd, fsub, fma, fmad or fma_legacy without nsz
+---
+name: test_fadd_sz
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fadd_sz
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FADD]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FADD %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fsub_sz
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_fsub_sz
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FSUB]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FSUB %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %3(s32)
+
+...
+---
+name: test_fma_sz
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fma_sz
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FMA]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = G_FMA %0, %1, %2
+    %4:_(s32) = G_FNEG %3
+    $vgpr0 = COPY %4(s32)
+
+...
+---
+name: test_fmad_sz
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fmad_sz
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[FMAD:%[0-9]+]]:_(s32) = G_FMAD [[COPY]], [[COPY1]], [[COPY2]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FMAD]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = G_FMAD %0, %1, %2
+    %4:_(s32) = G_FNEG %3
+    $vgpr0 = COPY %4(s32)
+
+...
+---
+name: test_amdgcn_fma_legacy_sz
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_amdgcn_fma_legacy_sz
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fma.legacy), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[INT]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fma.legacy), %0(s32), %1(s32), %2(s32)
+    %4:_(s32) = G_FNEG %3
+    $vgpr0 = COPY %4(s32)
+
+...
+
+# Don't negate 0 for minnum, maxnum
+---
+name: test_fminnum_zero
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: test_fminnum_zero
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:_(s32) = G_FMINNUM [[COPY]], [[C]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FMINNUM]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FCONSTANT float 0.000000e+00
+    %2:_(s32) = G_FMINNUM %0:_, %1:_
+    %3:_(s32) = G_FNEG %2:_
+    $vgpr0 = COPY %3:_(s32)
+
+...
+
+# On VI and above don't negate 1.0 / (2.0 * pi)
+---
+name: test_fminnum_inv2pi_half
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: test_fminnum_inv2pi_half
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3118
+    ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:_(s16) = G_FMINNUM [[TRUNC]], [[C]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[FMINNUM]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FNEG]](s16)
+    ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s16) = G_TRUNC %0:_(s32)
+    %2:_(s16) = G_FCONSTANT half 0xH3118
+    %3:_(s16) = G_FMINNUM %1:_, %2:_
+    %4:_(s16) = G_FNEG %3:_
+    %5:_(s32) = G_ANYEXT %4:_(s16)
+    $vgpr0 = COPY %5:_(s32)
+
+...
+---
+name: test_fminnum_inv2pi_float
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: test_fminnum_inv2pi_float
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x3FC45F3060000000
+    ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:_(s32) = G_FMINNUM [[COPY]], [[C]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FMINNUM]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FCONSTANT float 0x3FC45F3060000000
+    %2:_(s32) = G_FMINNUM %0:_, %1:_
+    %3:_(s32) = G_FNEG %2:_
+    $vgpr0 = COPY %3:_(s32)
+
+...
+---
+name: test_fminnum_inv2pi_double
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: test_fminnum_inv2pi_double
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FC45F306DC9C882
+    ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:_(s64) = G_FMINNUM [[COPY]], [[C]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMINNUM]]
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FNEG]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_FCONSTANT double 0x3FC45F306DC9C882
+    %2:_(s64) = G_FMINNUM %0:_, %1:_
+    %3:_(s64) = G_FNEG %2:_
+    $vgpr0_vgpr1 = COPY %3:_(s64)
+
+...
+
+# Don't fold where the instruction count will not decrease.
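+# In test_use_both the fmul result is also copied out directly, and a COPY
+# cannot absorb a source modifier, so the non-negated fmul must be kept and
+# folding the fneg would not reduce the instruction count.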
+---
+name: test_use_both
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_use_both
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FMUL]]
+    ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[COPY2]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+    ; CHECK-NEXT: $vgpr1 = COPY [[FNEG]](s32)
+    ; CHECK-NEXT: $vgpr2 = COPY [[FMUL1]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = G_FMUL %0, %1
+    %4:_(s32) = G_FNEG %3
+    %5:_(s32) = G_FMUL %4, %2
+    $vgpr0 = COPY %3:_(s32)
+    $vgpr1 = COPY %4:_(s32)
+    $vgpr2 = COPY %5:_(s32)
+
+...
+
+# Don't fold where the instruction count will not decrease.
+---
+name: test_use_both2
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_use_both2
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FMUL]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+    ; CHECK-NEXT: $vgpr1 = COPY [[FNEG]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FMUL %0, %1
+    %3:_(s32) = G_FNEG %2
+    $vgpr0 = COPY %2:_(s32)
+    $vgpr1 = COPY %3:_(s32)
+
+...
+
+---
+name: multiple_uses_of_fneg
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    ; CHECK-LABEL: name: multiple_uses_of_fneg
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY1]]
+    ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[FNEG]]
+    ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[COPY2]]
+    ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[COPY3]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+    ; CHECK-NEXT: $vgpr1 = COPY [[FMUL1]](s32)
+    ; CHECK-NEXT: $vgpr2 = COPY [[FMUL2]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+
+    %4:_(s32) = G_FMUL %0, %1
+    %5:_(s32) = G_FNEG %4
+    %6:_(s32) = G_FMUL %5, %2
+    %7:_(s32) = G_FMUL %5, %3
+
+    $vgpr0 = COPY %5:_(s32)
+    $vgpr1 = COPY %6:_(s32)
+    $vgpr2 = COPY %7:_(s32)
+
+...
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -131,6 +131,7 @@
     "AMDGPUAttributor.cpp",
     "AMDGPUCallLowering.cpp",
     "AMDGPUCodeGenPrepare.cpp",
+    "AMDGPUCombinerHelper.cpp",
     "AMDGPUCtorDtorLowering.cpp",
     "AMDGPUExportClustering.cpp",
     "AMDGPUFixFunctionBitcasts.cpp",