This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Improve frame index folding for pre-allocated objects
AbandonedPublic

Authored by arsenm on Nov 21 2016, 1:40 PM.

Download Raw Diff

Details

Reviewers

Summary

If the LocalStackSlotAllocation pass has already fixed the offset
for the object during operand folding, check if the offset is an
inline immediate.

There are a couple remaining issues separate from this patch
that need to be corrected before this can be committed.
LocalStackSlotAllocation decides that if it didn't need to insert any
frame base registers, it shouldn't create the pre-allocated
area, so no offsets are known.

Additionally, there are code regressions when the base frame index
is folded into an instruction at the expense of the offset.
E.g. for an add fi, offset, the frame index is folded instead of the
single-use offset, when it would be better to materialize the frame index,
which is re-used in multiple instructions.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 78774.Nov 21 2016, 1:40 PM

arsenm retitled this revision from to AMDGPU: Improve frame index folding for pre-allocated objects.

arsenm updated this object.

arsenm added a subscriber: llvm-commits.

Herald added a reviewer: • tstellarAMD. · View Herald TranscriptNov 21 2016, 1:40 PM

Herald added subscribers: tony-tye, yaxunl, nhaehnle and 2 others. · View Herald Transcript

LGTM.

This revision is now accepted and ready to land.Nov 23 2016, 12:41 PM

t-tye added a subscriber: t-tye.Mar 22 2017, 6:38 PM

tony-tye removed a subscriber: tony-tye.Mar 22 2017, 6:50 PM

I think this isn't necessarily correct. I don't think the local frame object is at offset 0.

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIFoldOperands.cpp

10 lines

SIInstrInfo.h

4 lines

SIInstrInfo.cpp

37 lines

Diff 78774

lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 266 Lines • ▼ Show 20 Lines	for (MachineRegisterInfo::use_iterator
foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,		foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
CopiesToReplace, TII, TRI, MRI);		CopiesToReplace, TII, TRI, MRI);
}		}

return;		return;
}		}


bool FoldingImm = OpToFold.isImm();		bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI();

// In order to fold immediates into copies, we need to change the		// In order to fold immediates into copies, we need to change the
// copy to a MOV.		// copy to a MOV.
if (FoldingImm && UseMI->isCopy()) {		if (FoldingImm && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();		unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC		const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?		= TargetRegisterInfo::isVirtualRegister(DestReg) ?
MRI.getRegClass(DestReg) :		MRI.getRegClass(DestReg) :
Show All 10 Lines	if (FoldingImm && UseMI->isCopy()) {

// Don't fold into target independent nodes. Target independent opcodes		// Don't fold into target independent nodes. Target independent opcodes
// don't have defined register classes.		// don't have defined register classes.
if (UseDesc.isVariadic() \|\|		if (UseDesc.isVariadic() \|\|
UseDesc.OpInfo[UseOpIdx].RegClass == -1)		UseDesc.OpInfo[UseOpIdx].RegClass == -1)
return;		return;
}		}

if (!FoldingImm) {		if (!OpToFold.isImm()) {
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);		tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

// FIXME: We could try to change the instruction from 64-bit to 32-bit		// FIXME: We could try to change the instruction from 64-bit to 32-bit
// to enable more folding opportunites. The shrink operands pass		// to enable more folding opportunites. The shrink operands pass
// already does this.		// already does this.
return;		return;
}		}

▲ Show 20 Lines • Show All 167 Lines • ▼ Show 20 Lines

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {		bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))		if (skipFunction(*MF.getFunction()))
return false;		return false;

const SISubtarget &ST = MF.getSubtarget<SISubtarget>();		const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

MachineRegisterInfo &MRI = MF.getRegInfo();		MachineRegisterInfo &MRI = MF.getRegInfo();
		const MachineFrameInfo &MFI = MF.getFrameInfo();

const SIInstrInfo *TII = ST.getInstrInfo();		const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();		const SIRegisterInfo &TRI = TII->getRegisterInfo();

for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();		for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {		BI != BE; ++BI) {

MachineBasicBlock &MBB = *BI;		MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;		MachineBasicBlock::iterator I, Next;
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	for (I = MBB.begin(); I != MBB.end(); I = Next) {
// Folding immediates with more than one use will increase program size.		// Folding immediates with more than one use will increase program size.
// FIXME: This will also reduce register usage, which may be better		// FIXME: This will also reduce register usage, which may be better
// in some cases. A better heuristic is needed.		// in some cases. A better heuristic is needed.
for (MachineRegisterInfo::use_iterator		for (MachineRegisterInfo::use_iterator
Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();		Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
Use != E; ++Use) {		Use != E; ++Use) {
MachineInstr *UseMI = Use->getParent();		MachineInstr *UseMI = Use->getParent();

if (TII->isInlineConstant(OpToFold, OpSize)) {		if ((OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpSize)) \|\|
		(OpToFold.isFI() &&
		TII->isFrameIndexPreAllocInlineImm(MFI, OpToFold.getIndex()))) {
foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,		foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
CopiesToReplace, TII, TRI, MRI);		CopiesToReplace, TII, TRI, MRI);
} else {		} else {
if (++NumLiteralUses == 1) {		if (++NumLiteralUses == 1) {
NonInlineUse = &*Use;		NonInlineUse = &*Use;
NonInlineUseOpNo = Use.getOperandNo();		NonInlineUseOpNo = Use.getOperandNo();
}		}
}		}
▲ Show 20 Lines • Show All 47 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.h

Show First 20 Lines • Show All 458 Lines • ▼ Show 20 Lines	public:
bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;		bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;

// Returns true if this operand could potentially require a 32-bit literal		// Returns true if this operand could potentially require a 32-bit literal
// operand, but not necessarily. A FrameIndex for example could resolve to an		// operand, but not necessarily. A FrameIndex for example could resolve to an
// inline immediate value that will not require an additional 4-bytes; this		// inline immediate value that will not require an additional 4-bytes; this
// assumes that it will.		// assumes that it will.
bool isLiteralConstantLike(const MachineOperand &MO, unsigned OpSize) const;		bool isLiteralConstantLike(const MachineOperand &MO, unsigned OpSize) const;

		/// \returns true if the object \p FI has already had its offset determined to
		/// be a valid inline immediate value.
		bool isFrameIndexPreAllocInlineImm(const MachineFrameInfo &MFI, int FI) const;

bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,		bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;		const MachineOperand &MO) const;

/// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.		/// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.		/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;		bool hasVALU32BitEncoding(unsigned Opcode) const;

/// \brief Returns true if this operand uses the constant bus.		/// \brief Returns true if this operand uses the constant bus.
▲ Show 20 Lines • Show All 231 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 1,764 Lines • ▼ Show 20 Lines	case MachineOperand::MO_Register:
return Op0.getReg() == Op1.getReg();		return Op0.getReg() == Op1.getReg();
case MachineOperand::MO_Immediate:		case MachineOperand::MO_Immediate:
return Op0.getImm() == Op1.getImm();		return Op0.getImm() == Op1.getImm();
default:		default:
llvm_unreachable("Didn't expect to be comparing these operand types");		llvm_unreachable("Didn't expect to be comparing these operand types");
}		}
}		}

		bool SIInstrInfo::isFrameIndexPreAllocInlineImm(const MachineFrameInfo &MFI,
		int FI) const {
		// Check if LocalStackSlotAllocation has already determined the offset for
		// this frame index.
		if (!MFI.getUseLocalStackAllocationBlock() \|\| !MFI.isObjectPreAllocated(FI))
		return false;

		for (int I = 0, E = MFI.getLocalFrameObjectCount(); I != E; ++I) {
		int ObjFI;
		int64_t ObjOffset;
		std::tie(ObjFI, ObjOffset) = MFI.getLocalFrameObjectMap(I);
		if (ObjFI == FI) {
		assert(isUInt<32>(ObjOffset));
		dbgs() << "Found FI#" << FI << " at offset " << ObjOffset << '\n';
		return isInlineConstant(APInt(32, ObjOffset));
		}
		}

		return false;
		}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,		bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const {		const MachineOperand &MO) const {
const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];		const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];

assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI());		assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI());

if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)		if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;		return true;

if (OpInfo.RegClass < 0)		if (OpInfo.RegClass < 0)
return false;		return false;

unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();		if (MO.isFI()) {
if (isLiteralConstant(MO, OpSize))		assert(AMDGPU::getRegBitWidth(OpInfo.RegClass) == 32);

		if (RI.opCanUseInlineConstant(OpInfo.OperandType) &&
		isFrameIndexPreAllocInlineImm(MI.getParent()->getParent()->getFrameInfo(),
		MO.getIndex()))
		return true;

return RI.opCanUseLiteralConstant(OpInfo.OperandType);		return RI.opCanUseLiteralConstant(OpInfo.OperandType);
		}

		unsigned OpSize = AMDGPU::getRegBitWidth(OpInfo.RegClass) / 8;
		if (isLiteralConstant(MO, OpSize)) {
		return RI.opCanUseLiteralConstant(OpInfo.OperandType);
		}

return RI.opCanUseInlineConstant(OpInfo.OperandType);		return RI.opCanUseInlineConstant(OpInfo.OperandType);
}		}

bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {		bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
int Op32 = AMDGPU::getVOPe32(Opcode);		int Op32 = AMDGPU::getVOPe32(Opcode);
if (Op32 == -1)		if (Op32 == -1)
return false;		return false;
▲ Show 20 Lines • Show All 1,841 Lines • Show Last 20 Lines