This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Make SIInsertWaits about a factor of 4 faster
ClosedPublic

Authored by arsenm on Sep 25 2015, 5:52 PM.

Download Raw Diff

Details

Reviewers

Summary

This was the slowest target-specific custom pass; it was spending 80%
of its time in getMinimalPhysRegClass, which was called
for every register operand.

Try to use the statically known register class from the instruction's
MCOperandInfo when possible. A few pseudo instructions are not well
behaved and have operands with unknown register classes, so they still
require the expensive physical register class search.

There are a few other possibilities for making this even faster,
such as not inspecting implicit operands. For now those are checked
because it is technically possible to have a scalar load into
exec or vcc which can be implicitly used.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 35787. · Sep 25 2015, 5:52 PM

arsenm retitled this revision to AMDGPU: Make SIInsertWaits about a factor of 4 faster.

arsenm updated this object.

arsenm added a reviewer: tstellarAMD.

arsenm added a subscriber: llvm-commits.

Herald added a subscriber: arsenm. · View Herald Transcript · Sep 25 2015, 5:52 PM

LGTM.

This revision is now accepted and ready to land. · Oct 1 2015, 1:40 PM

r249079

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIInsertWaits.cpp

45 lines

SIRegisterInfo.cpp

2 lines

Diff 35787

lib/Target/AMDGPU/SIInsertWaits.cpp

Show First 20 Lines • Show All 85 Lines • ▼ Show 20 Lines	private:

/// \brief Get increment/decrement amount for this instruction.		/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);		Counters getHwCounts(MachineInstr &MI);

/// \brief Is operand relevant for async execution?		/// \brief Is operand relevant for async execution?
bool isOpRelevant(MachineOperand &Op);		bool isOpRelevant(MachineOperand &Op);

/// \brief Get register interval an operand affects.		/// \brief Get register interval an operand affects.
RegInterval getRegInterval(MachineOperand &Op);		RegInterval getRegInterval(const TargetRegisterClass *RC,
		const MachineOperand &Reg) const;

/// \brief Handle instructions async components		/// \brief Handle instructions async components
void pushInstruction(MachineBasicBlock &MBB,		void pushInstruction(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I);		MachineBasicBlock::iterator I);

/// \brief Insert the actual wait instruction		/// \brief Insert the actual wait instruction
bool insertWait(MachineBasicBlock &MBB,		bool insertWait(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,		MachineBasicBlock::iterator I,
Show All 34 Lines
const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };		const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };		const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };

FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {		FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
return new SIInsertWaits(tm);		return new SIInsertWaits(tm);
}		}

Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {		Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
		uint64_t TSFlags = MI.getDesc().TSFlags;
uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
Counters Result = { { 0, 0, 0 } };		Counters Result = { { 0, 0, 0 } };

Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);		Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);

// Only consider stores or EXP for EXP_CNT		// Only consider stores or EXP for EXP_CNT
Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&		Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
(MI.getOpcode() == AMDGPU::EXP \|\| MI.getDesc().mayStore()));		(MI.getOpcode() == AMDGPU::EXP \|\| MI.getDesc().mayStore()));

// LGKM may uses larger values		// LGKM may uses larger values
if (TSFlags & SIInstrFlags::LGKM_CNT) {		if (TSFlags & SIInstrFlags::LGKM_CNT) {

if (TII->isSMRD(MI.getOpcode())) {		if (TII->isSMRD(MI.getOpcode())) {

if (MI.getNumOperands() != 0) {		if (MI.getNumOperands() != 0) {
MachineOperand &Op = MI.getOperand(0);		MachineOperand &Op = MI.getOperand(0);
assert(Op.isReg() && "First LGKM operand must be a register!");		assert(Op.isReg() && "First LGKM operand must be a register!");

unsigned Reg = Op.getReg();

// XXX - What if this is a write into a super register?		// XXX - What if this is a write into a super register?
unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();		const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
		unsigned Size = RC->getSize();
Result.Named.LGKM = Size > 4 ? 2 : 1;		Result.Named.LGKM = Size > 4 ? 2 : 1;
} else {		} else {
// s_dcache_inv etc. do not have a a destination register. Assume we		// s_dcache_inv etc. do not have a a destination register. Assume we
// want a wait on these.		// want a wait on these.
// XXX - What is the right value?		// XXX - What is the right value?
Result.Named.LGKM = 1;		Result.Named.LGKM = 1;
}		}
} else {		} else {
// DS		// DS
Result.Named.LGKM = 1;		Result.Named.LGKM = 1;
}		}

} else {		} else {
Result.Named.LGKM = 0;		Result.Named.LGKM = 0;
}		}

return Result;		return Result;
}		}

bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {		bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {

// Constants are always irrelevant		// Constants are always irrelevant
if (!Op.isReg())		if (!Op.isReg() \|\| !TRI->isInAllocatableClass(Op.getReg()))
return false;		return false;

// Defines are always relevant		// Defines are always relevant
if (Op.isDef())		if (Op.isDef())
return true;		return true;

// For exports all registers are relevant		// For exports all registers are relevant
MachineInstr &MI = *Op.getParent();		MachineInstr &MI = *Op.getParent();
Show All 32 Lines	for (MachineInstr::mop_iterator I = MI.operands_begin(),

if (I->isReg() && I->isUse())		if (I->isReg() && I->isUse())
return Op.isIdenticalTo(*I);		return Op.isIdenticalTo(*I);
}		}

return false;		return false;
}		}

RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {		RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
		const MachineOperand &Reg) const {
if (!Op.isReg() \|\| !TRI->isInAllocatableClass(Op.getReg()))		unsigned Size = RC->getSize();
return std::make_pair(0, 0);

unsigned Reg = Op.getReg();
unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();

assert(Size >= 4);		assert(Size >= 4);

RegInterval Result;		RegInterval Result;
Result.first = TRI->getEncodingValue(Reg);		Result.first = TRI->getEncodingValue(Reg.getReg());
Result.second = Result.first + Size / 4;		Result.second = Result.first + Size / 4;

return Result;		return Result;
}		}

void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,		void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {		MachineBasicBlock::iterator I) {

▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
}		}

// Remember which export instructions we have seen		// Remember which export instructions we have seen
if (Increment.Named.EXP) {		if (Increment.Named.EXP) {
ExpInstrTypesSeen \|= I->getOpcode() == AMDGPU::EXP ? 1 : 2;		ExpInstrTypesSeen \|= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
}		}

for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {		for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {

MachineOperand &Op = I->getOperand(i);		MachineOperand &Op = I->getOperand(i);
if (!isOpRelevant(Op))		if (!isOpRelevant(Op))
continue;		continue;

RegInterval Interval = getRegInterval(Op);		const TargetRegisterClass RC = TII->getOpRegClass(I, i);
		RegInterval Interval = getRegInterval(RC, Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {		for (unsigned j = Interval.first; j < Interval.second; ++j) {

// Remember which registers we define		// Remember which registers we define
if (Op.isDef())		if (Op.isDef())
DefinedRegs[j] = Limit;		DefinedRegs[j] = Limit;

// and which one we are using		// and which one we are using
if (Op.isUse())		if (Op.isUse())
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines	Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
Counters Result = ZeroCounts;		Counters Result = ZeroCounts;

// S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,		// S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
// but we also want to wait for any other outstanding transfers before		// but we also want to wait for any other outstanding transfers before
// signalling other hardware blocks		// signalling other hardware blocks
if (MI.getOpcode() == AMDGPU::S_SENDMSG)		if (MI.getOpcode() == AMDGPU::S_SENDMSG)
return LastIssued;		return LastIssued;

// For each register affected by this		// For each register affected by this instruction increase the result
// instruction increase the result sequence		// sequence.
		//
		// TODO: We could probably just look at explicit operands if we removed VCC /
		// EXEC from SMRD dest reg classes.
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {		for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {

MachineOperand &Op = MI.getOperand(i);		MachineOperand &Op = MI.getOperand(i);
RegInterval Interval = getRegInterval(Op);		if (!Op.isReg() \|\| !TRI->isInAllocatableClass(Op.getReg()))
		continue;

		const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
		RegInterval Interval = getRegInterval(RC, Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {		for (unsigned j = Interval.first; j < Interval.second; ++j) {

if (Op.isDef()) {		if (Op.isDef()) {
increaseCounters(Result, UsedRegs[j]);		increaseCounters(Result, UsedRegs[j]);
increaseCounters(Result, DefinedRegs[j]);		increaseCounters(Result, DefinedRegs[j]);
}		}

if (Op.isUse())		if (Op.isUse())
▲ Show 20 Lines • Show All 74 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIRegisterInfo.cpp

Show First 20 Lines • Show All 320 Lines • ▼ Show 20 Lines	switch (MI->getOpcode()) {
}		}
}		}
}		}

unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {		unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
return getEncodingValue(Reg) & 0xff;		return getEncodingValue(Reg) & 0xff;
}		}

		// FIXME: This is very slow. It might be worth creating a map from physreg to
		// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {		const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));		assert(!TargetRegisterInfo::isVirtualRegister(Reg));

static const TargetRegisterClass *BaseClasses[] = {		static const TargetRegisterClass *BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,		&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,		&AMDGPU::SReg_32RegClass,
&AMDGPU::VReg_64RegClass,		&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,		&AMDGPU::SReg_64RegClass,
▲ Show 20 Lines • Show All 218 Lines • Show Last 20 Lines