Diff 384418

llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h

Show First 20 Lines • Show All 311 Lines • ▼ Show 20 Lines

template <typename LHS, typename RHS>		template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, TargetOpcode::G_MUL, true>		inline BinaryOp_match<LHS, RHS, TargetOpcode::G_MUL, true>
m_GMul(const LHS &L, const RHS &R) {		m_GMul(const LHS &L, const RHS &R) {
return BinaryOp_match<LHS, RHS, TargetOpcode::G_MUL, true>(L, R);		return BinaryOp_match<LHS, RHS, TargetOpcode::G_MUL, true>(L, R);
}		}

template <typename LHS, typename RHS>		template <typename LHS, typename RHS>
		// Matcher factory for G_FDIV with sub-patterns L (dividend) and R (divisor).
		// Floating-point division is not commutative, so the trailing commutative
		// flag of BinaryOp_match is false: a pattern such as m_GFDiv(One, X) must
		// not also accept X / One with the operands swapped.
		inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FDIV, false>
		m_GFDiv(const LHS &L, const RHS &R) {
		return BinaryOp_match<LHS, RHS, TargetOpcode::G_FDIV, false>(L, R);
		}

		template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FADD, true>		inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FADD, true>
m_GFAdd(const LHS &L, const RHS &R) {		m_GFAdd(const LHS &L, const RHS &R) {
return BinaryOp_match<LHS, RHS, TargetOpcode::G_FADD, true>(L, R);		return BinaryOp_match<LHS, RHS, TargetOpcode::G_FADD, true>(L, R);
}		}

template <typename LHS, typename RHS>		template <typename LHS, typename RHS>
inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FMUL, true>		inline BinaryOp_match<LHS, RHS, TargetOpcode::G_FMUL, true>
m_GFMul(const LHS &L, const RHS &R) {		m_GFMul(const LHS &L, const RHS &R) {
▲ Show 20 Lines • Show All 131 Lines • ▼ Show 20 Lines	inline UnaryOp_match<SrcTy, TargetOpcode::G_FNEG> m_GFNeg(const SrcTy &Src) {
return UnaryOp_match<SrcTy, TargetOpcode::G_FNEG>(Src);		return UnaryOp_match<SrcTy, TargetOpcode::G_FNEG>(Src);
}		}

template <typename SrcTy>		template <typename SrcTy>
inline UnaryOp_match<SrcTy, TargetOpcode::COPY> m_Copy(SrcTy &&Src) {		inline UnaryOp_match<SrcTy, TargetOpcode::COPY> m_Copy(SrcTy &&Src) {
return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));		return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));
}		}

		/// Returns a UnaryOp_match for TargetOpcode::G_FSQRT that wraps the
		/// sub-pattern \p Src.
		template <typename SrcTy>
		inline UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>
		m_GFSqrt(const SrcTy &Src) {
		  using SqrtMatcher = UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>;
		  return SqrtMatcher(Src);
		}

// General helper for generic MI compares, i.e. G_ICMP and G_FCMP		// General helper for generic MI compares, i.e. G_ICMP and G_FCMP
// TODO: Allow checking a specific predicate.		// TODO: Allow checking a specific predicate.
template <typename Pred_P, typename LHS_P, typename RHS_P, unsigned Opcode>		template <typename Pred_P, typename LHS_P, typename RHS_P, unsigned Opcode>
struct CompareOp_match {		struct CompareOp_match {
Pred_P P;		Pred_P P;
LHS_P L;		LHS_P L;
RHS_P R;		RHS_P R;

▲ Show 20 Lines • Show All 99 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Show All 20 Lines


def uchar_to_float : GICombineRule<		def uchar_to_float : GICombineRule<
(defs root:$itofp),		(defs root:$itofp),
(match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp,		(match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp,
[{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),		[{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
(apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;		(apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;


		// Combine rule rooted at a G_INTRINSIC, G_FDIV or G_FSQRT ($rcp). The match
		// step defers to PostLegalizerHelper.matchRcpSqrtToRsq, which fills
		// $matchinfo with a build function; the apply step hands that function to
		// Helper.applyBuildFn together with the root instruction.
		def rcp_sqrt_to_rsq : GICombineRule<
		(defs root:$rcp, build_fn_matchinfo:$matchinfo),
		(match (wip_match_opcode G_INTRINSIC, G_FDIV, G_FSQRT):$rcp,
		[{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
		(apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;


def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;		def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;

def cvt_f32_ubyteN : GICombineRule<		def cvt_f32_ubyteN : GICombineRule<
(defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo),		(defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo),
(match (wip_match_opcode G_AMDGPU_CVT_F32_UBYTE0,		(match (wip_match_opcode G_AMDGPU_CVT_F32_UBYTE0,
G_AMDGPU_CVT_F32_UBYTE1,		G_AMDGPU_CVT_F32_UBYTE1,
G_AMDGPU_CVT_F32_UBYTE2,		G_AMDGPU_CVT_F32_UBYTE2,
G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN,		G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN,
Show All 34 Lines	def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {		"AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";		let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
let StateClass = "AMDGPUPreLegalizerCombinerHelperState";		let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
}		}

def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<		def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",		"AMDGPUGenPostLegalizerCombinerHelper",
[all_combines, gfx6gfx7_combines,		[all_combines, gfx6gfx7_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {		uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize,
		rcp_sqrt_to_rsq]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";		let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";		let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
let AdditionalArguments = [];		let AdditionalArguments = [];
}		}

def AMDGPURegBankCombinerHelper : GICombinerHelper<		def AMDGPURegBankCombinerHelper : GICombinerHelper<
"AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> {		"AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> {
let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";		let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
let StateClass = "AMDGPURegBankCombinerHelperState";		let StateClass = "AMDGPURegBankCombinerHelperState";
let AdditionalArguments = [];		let AdditionalArguments = [];
}		}

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

	Show First 20 Lines • Show All 650 Lines • ▼ Show 20 Lines
	>;			>;
	} // AddedComplexity.			} // AddedComplexity.

	class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat <			class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat <
	(fdiv FP_ONE, vt:$src),			(fdiv FP_ONE, vt:$src),
	(RcpInst $src)			(RcpInst $src)
	>;			>;

	// Selection pattern parameterized over the target instruction and value
	// type: (AMDGPUrcp (fsqrt $src)) of type vt is selected to (RsqInst $src).
	class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
	(AMDGPUrcp (fsqrt vt:$src)),
	(RsqInst $src)
	>;

	// Instructions which select to the same v_min_f*			// Instructions which select to the same v_min_f*
	def fminnum_like : PatFrags<(ops node:$src0, node:$src1),			def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
	[(fminnum_ieee node:$src0, node:$src1),			[(fminnum_ieee node:$src0, node:$src1),
	(fminnum node:$src0, node:$src1)]			(fminnum node:$src0, node:$src1)]
	>;			>;

	// Instructions which select to the same v_max_f*			// Instructions which select to the same v_max_f*
	def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1),			def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1),
	Show All 23 Lines

llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp

Show All 14 Lines
#include "AMDGPULegalizerInfo.h"		#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"		#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"		#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"		#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"		#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"		#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"		#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"		#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
		#include "llvm/IR/IntrinsicsAMDGPU.h"
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code -#include "llvm/IR/IntrinsicsAMDGPU.h" Lint: Pre-merge checks: clang-format: please reformat the code ``` -#include "llvm/IR/IntrinsicsAMDGPU.h" ```
#include "llvm/CodeGen/MachineDominators.h"		#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"		#include "llvm/CodeGen/TargetPassConfig.h"
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code +#include "llvm/IR/IntrinsicsAMDGPU.h" Lint: Pre-merge checks: clang-format: please reformat the code ``` +#include "llvm/IR/IntrinsicsAMDGPU.h" ```
#include "llvm/Target/TargetMachine.h"		#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"		#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;		using namespace llvm;
using namespace MIPatternMatch;		using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {		class AMDGPUPostLegalizerCombinerHelper {
Show All 18 Lines	public:
// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize		// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);		bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,		void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
const FMinFMaxLegacyInfo &Info);		const FMinFMaxLegacyInfo &Info);

bool matchUCharToFloat(MachineInstr &MI);		bool matchUCharToFloat(MachineInstr &MI);
void applyUCharToFloat(MachineInstr &MI);		void applyUCharToFloat(MachineInstr &MI);

		// Matcher used by the rcp_sqrt_to_rsq combine rule. On success it returns
		// true and stores in MatchInfo a callback that builds the amdgcn.rsq
		// intrinsic replacing MI.
		bool matchRcpSqrtToRsq(MachineInstr &MI,
		std::function<void(MachineIRBuilder &)> &MatchInfo);
		// NOTE(review): no definition or use of matchSqrtRcpToRsq is visible in
		// this diff; matchRcpSqrtToRsq appears to handle both rcp(sqrt(x)) and
		// sqrt(rcp(x)) itself. Confirm whether this declaration is still needed.
		bool matchSqrtRcpToRsq(MachineInstr &MI,
		std::function<void(MachineIRBuilder &)> &MatchInfo);

// FIXME: Should be able to have 2 separate matchdatas rather than custom		// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.		// struct boilerplate.
struct CvtF32UByteMatchInfo {		struct CvtF32UByteMatchInfo {
Register CvtVal;		Register CvtVal;
unsigned ShiftOffset;		unsigned ShiftOffset;
};		};

bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);		bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
▲ Show 20 Lines • Show All 129 Lines • ▼ Show 20 Lines	if (Ty == S32) {
auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},		auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
{SrcReg}, MI.getFlags());		{SrcReg}, MI.getFlags());
B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());		B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
}		}

MI.eraseFromParent();		MI.eraseFromParent();
}		}

		/// Match rcp(sqrt(x)) or sqrt(rcp(x)) rooted at \p MI. On success, set
		/// \p MatchInfo to a callback that builds an amdgcn.rsq intrinsic defining
		/// MI's operand 0, using x as its source and copying MI's flags.
		/// \returns true if either form matched and \p MatchInfo was populated,
		/// false otherwise (\p MatchInfo is then left untouched).
		bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
		MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {

		// If MI is a reciprocal, return the instruction defining its input:
		// operand 2 of an amdgcn.rcp intrinsic, or the divisor of a G_FDIV whose
		// dividend matches the constant 1. Returns nullptr otherwise.
		auto getRcpSrc = [=](const MachineInstr &MI) {
		if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
		MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
		return MRI.getVRegDef(MI.getOperand(2).getReg());

		// NOTE(review): m_SpecificICst looks like an integer-constant matcher; a
		// floating-point 1.0 dividend would presumably need an FP-constant match.
		// Confirm (see also the review remark below about handling G_FDIV at all).
		MachineInstr *DivSrcMI = nullptr;
		mi_match(MI.getOperand(0).getReg(), MRI,
		foadUnsubmitted Not Done Reply Inline Actions I still think it's wrong to handle G_FDIV here. it's unnecessary, because we are running post-legalizer and G_FDIV will always get legalized to something else. even if G_FDIV did appear here, I don't think it should be combined into an rsq instruction without checking for all the fast/unsafe math flags, like in AMDGPULegalizerInfo::legalizeFastUnsafeFDIV. I think we just need an IR test to check that `fdiv float 1.0, %x1` with appropriate fast math flags get combined with `@llvm.fsqrt` to generate a v_rsq instruction. foad: I still think it's wrong to handle G_FDIV here. - it's unnecessary, because we are running post…
		m_GFDiv(m_SpecificICst(1), m_MInstr(DivSrcMI)));
		return DivSrcMI;
		};

		// If MI is a G_FSQRT, return the instruction defining its input; nullptr
		// otherwise.
		auto getSqrtSrc = [=](const MachineInstr &MI) {
		MachineInstr *SqrtSrcMI = nullptr;
		mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
		return SqrtSrcMI;
		};

		// Both helpers return MachineInstr pointers and the results are
		// dereferenced below, so these must be pointers, not MachineInstr objects.
		MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
		// rcp(sqrt(x)): RcpSrcMI is the sqrt, SqrtSrcMI is the instruction defining x.
		if ((RcpSrcMI = getRcpSrc(MI)) &&
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - if ((RcpSrcMI = getRcpSrc(MI)) && - (SqrtSrcMI = getSqrtSrc(RcpSrcMI))) { + if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(RcpSrcMI))) { Lint: Pre-merge checks: clang-format: please reformat the code ``` - if ((RcpSrcMI = getRcpSrc(MI)) &&…
		(SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
		MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
		B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
		foadUnsubmitted Not Done Reply Inline Actions I'm not sure whether it's best to copy flags from MI or RcpSrcMI or somehow combine both. I guess this is fine for now. foad: I'm not sure whether it's best to copy flags from MI or RcpSrcMI or somehow combine both. I…
		.addUse(SqrtSrcMI->getOperand(0).getReg())
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - .addUse(SqrtSrcMI->getOperand(0).getReg()) - .setMIFlags(MI.getFlags()); + .addUse(SqrtSrcMI->getOperand(0).getReg()) + .setMIFlags(MI.getFlags()); Lint: Pre-merge checks: clang-format: please reformat the code ``` - .addUse(SqrtSrcMI->getOperand(0).getReg())…
		.setMIFlags(MI.getFlags());
		};
		return true;
		}

		// sqrt(rcp(x)): SqrtSrcMI is the rcp, RcpSrcMI is the instruction defining x.
		if ((SqrtSrcMI = getSqrtSrc(MI)) &&
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - if ((SqrtSrcMI = getSqrtSrc(MI)) && - (RcpSrcMI = getRcpSrc(SqrtSrcMI))) { + if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(SqrtSrcMI))) { Lint: Pre-merge checks: clang-format: please reformat the code ``` - if ((SqrtSrcMI = getSqrtSrc(MI)) &&…
		(RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
		MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
		B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
		.addUse(RcpSrcMI->getOperand(0).getReg())
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - .addUse(RcpSrcMI->getOperand(0).getReg()) - .setMIFlags(MI.getFlags()); + .addUse(RcpSrcMI->getOperand(0).getReg()) + .setMIFlags(MI.getFlags()); Lint: Pre-merge checks: clang-format: please reformat the code ``` - .addUse(RcpSrcMI->getOperand(0).getReg())…
		.setMIFlags(MI.getFlags());
		};
		return true;
		}

		return false;
		}


		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - Lint: Pre-merge checks: clang-format: please reformat the code ``` - ```
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(		bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {		MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
Register SrcReg = MI.getOperand(1).getReg();		Register SrcReg = MI.getOperand(1).getReg();

// Look through G_ZEXT.		// Look through G_ZEXT.
mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));		mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

Register Src0;		Register Src0;
▲ Show 20 Lines • Show All 192 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/CaymanInstructions.td

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;			def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
	def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;			def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
	def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;			def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
	def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;			def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
	def SIN_cm : SIN_Common<0x8D>;			def SIN_cm : SIN_Common<0x8D>;
	def COS_cm : COS_Common<0x8E>;			def COS_cm : COS_Common<0x8E>;
	} // End isVector = 1			} // End isVector = 1

	def : RsqPat<RECIPSQRT_IEEE_cm, f32>;

	def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;			def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;

	def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;			def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;

	defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;			defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;

	// RECIP_UINT emulation for Cayman			// RECIP_UINT emulation for Cayman
	// The multiplication scales from [0,1) to the unsigned integer range,			// The multiplication scales from [0,1) to the unsigned integer range,
	▲ Show 20 Lines • Show All 162 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/EvergreenInstructions.td

	Show First 20 Lines • Show All 120 Lines • ▼ Show 20 Lines
	def MULHI_UINT24_eg : MULHI_UINT24_Common<0xb2>;			def MULHI_UINT24_eg : MULHI_UINT24_Common<0xb2>;

	def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;			def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
	def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;			def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
	def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;			def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
	def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;			def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
	def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;			def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
	def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;			def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
	def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
	def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;			def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;

	def SIN_eg : SIN_Common<0x8D>;			def SIN_eg : SIN_Common<0x8D>;
	def COS_eg : COS_Common<0x8E>;			def COS_eg : COS_Common<0x8E>;

	def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;			def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
	} // End SubtargetPredicate = isEG			} // End SubtargetPredicate = isEG

	▲ Show 20 Lines • Show All 767 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/R600Instructions.td

Show First 20 Lines • Show All 1,269 Lines • ▼ Show 20 Lines	let Predicates = [isR600] in {
def MULHI_INT_r600 : MULHI_INT_Common<0x74>;		def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;		def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;		def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;		def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;

defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;		defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;		def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;

def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;		def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;

def R600_ExportSwz : ExportSwzInst {		def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT		let Word1{20-17} = 0; // BURST_COUNT
let Word1{21} = eop;		let Word1{21} = eop;
let Word1{22} = 0; // VALID_PIXEL_MODE		let Word1{22} = 0; // VALID_PIXEL_MODE
let Word1{30-23} = inst;		let Word1{30-23} = inst;
let Word1{31} = 1; // BARRIER		let Word1{31} = 1; // BARRIER
▲ Show 20 Lines • Show All 514 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 821 Lines • ▼ Show 20 Lines
	>;			>;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// VOP1 Patterns			// VOP1 Patterns
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	let OtherPredicates = [UnsafeFPMath] in {			let OtherPredicates = [UnsafeFPMath] in {

	//defm : RsqPat<V_RSQ_F32_e32, f32>;

	def : RsqPat<V_RSQ_F32_e32, f32>;
	arsenm (Unsubmitted, Not Done): Can probably delete the definition of RsqPat too

	// Convert (x - floor(x)) to fract(x)			// Convert (x - floor(x)) to fract(x)
				arsenm (Unsubmitted, Not Done): I don't understand this change. Are you saying this is a dead selection pattern for the DAG? Should we be doing this in the combiner instead and just delete this? That way we could consider the fast math flags and not rely on the function attribute.
				matejam (Author, Unsubmitted, Done): I am, with or without this pattern SDAG combines v_sqrt + v_rcp into v_rsq. I'm not sure which would be better: to leave this as a pattern or write a combiner for this. In fact SDAG doesn't even need any flags to combine into v_rsq.
				arsenm (Unsubmitted, Not Done): If this is a dead pattern in the DAG, I would just delete it. When you say without flags, I assume you mean with the unsafe attribute? I'm a bit worried this pattern is just broken as-is. This depends on the denormal mode, and also could be augmented to use the per-instruction flags. I think it's safer to move this to a combine.
				matejam (Author, Unsubmitted, Done): I tried deleting the SDAG combiner (SITargetLowering::performRcpCombine()) for v_rsq and then SDAG uses this pattern instead. So I assume it's either this pattern without the SDAG rcp combiner or the SDAG rcp combiner + new GlobalISel combiner?
	def : GCNPat <			def : GCNPat <
	(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),			(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
	(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),			(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
	(V_FRACT_F32_e64 $mods, $x)			(V_FRACT_F32_e64 $mods, $x)
	>;			>;

	// Convert (x + (-floor(x))) to fract(x)			// Convert (x + (-floor(x))) to fract(x)
	def : GCNPat <			def : GCNPat <
	▲ Show 20 Lines • Show All 2,049 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir

This file was added.

				# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
				# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s

				---
				name: rcp_sqrt_test
				body: |
				bb.0:
				liveins: $sgpr0

				; CHECK: $vgpr0 = COPY %3
				; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
				; GCN-LABEL: name: rcp_sqrt_test
				; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
				; GCN: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[INT]](s32)
				; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
				%0:_(s32) = COPY $sgpr0
				arsenm (Unsubmitted, Not Done): This looks like it lost the fast math flags
				%2:_(s32) = G_FSQRT %0:_
				%3:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
				$vgpr0 = COPY %3:_(s32)
				SI_RETURN_TO_EPILOG implicit $vgpr0

				...


				---
				name: div_sqrt_test
				body: |
				bb.0:
				liveins: $sgpr0

				; CHECK: $vgpr0 = COPY %3
				; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
				; GCN-LABEL: name: div_sqrt_test
				; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
				; GCN: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[INT]](s32)
				; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
				%0:_(s32) = COPY $sgpr0
				%2:_(s32) = G_FSQRT %0:_
				%1:_(s32) = G_CONSTANT i32 1
				%3:_(s32) = afn G_FDIV %1, %2:_(s32)
				$vgpr0 = COPY %3:_(s32)
				SI_RETURN_TO_EPILOG implicit $vgpr0

				...

				---
				name: sqrt_rcp_test
				body: |
				bb.0:
				liveins: $sgpr0

				; GCN-LABEL: name: sqrt_rcp_test
				; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
				; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[INT]](s32)
				; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
				%0:_(s32) = COPY $sgpr0
				%2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0:_(s32)
				%3:_(s32) = G_FSQRT %2:_
				$vgpr0 = COPY %3:_(s32)
				SI_RETURN_TO_EPILOG implicit $vgpr0

				...


				---
				name: sqrt_div_test
				body: |
				bb.0:
				liveins: $sgpr0

				; GCN-LABEL: name: sqrt_div_test
				; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
				; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[INT]](s32)
				; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
				%0:_(s32) = COPY $sgpr0
				%1:_(s32) = G_CONSTANT i32 1
				%2:_(s32) = afn G_FDIV %1, %0:_(s32)
				%3:_(s32) = G_FSQRT %2:_
				$vgpr0 = COPY %3:_(s32)
				SI_RETURN_TO_EPILOG implicit $vgpr0

				...

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][GlobalISel] Code quality: Combine V_RSQ
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 384418

llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp

llvm/lib/Target/AMDGPU/CaymanInstructions.td

llvm/lib/Target/AMDGPU/EvergreenInstructions.td

llvm/lib/Target/AMDGPU/R600Instructions.td

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][GlobalISel] Code quality: Combine V_RSQ
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 384418

llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp

llvm/lib/Target/AMDGPU/CaymanInstructions.td

llvm/lib/Target/AMDGPU/EvergreenInstructions.td

llvm/lib/Target/AMDGPU/R600Instructions.td

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir

[AMDGPU][GlobalISel] Code quality: Combine V_RSQ
ClosedPublic