This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Refactor immediate folding logic
ClosedPublic

Authored by arsenm on Nov 20 2016, 1:39 PM.

Download Raw Diff

Details

Reviewers

Summary

Change the logic for when to fold immediates to
consider the destination operand rather than the
source of the materializing mov instruction.

No functional change yet, but this will allow for correctly handling
i16/f16 operands. Since 32-bit moves are used to materialize
constants for these, the same bit value will not be in the
register.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 78668. Nov 20 2016, 1:39 PM

arsenm retitled this revision from to AMDGPU: Refactor immediate folding logic.

arsenm updated this object.

arsenm added a subscriber: llvm-commits.

Herald added a reviewer: • tstellarAMD. · View Herald Transcript · Nov 20 2016, 1:39 PM

Herald added subscribers: tony-tye, yaxunl, nhaehnle and 2 others. · View Herald Transcript

This will help in the short term to make 16-bit operands correct, but in the longer term it might be better to look at the use instructions and find the operand defs. Right now there seem to be some mis-optimizations when multiple operands could be folded into the same instruction and the wrong operand is selected.

LGTM.

This revision is now accepted and ready to land. Nov 29 2016, 11:11 AM

r288184

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIFoldOperands.cpp

65 lines

Diff 78668

lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 518 Lines • ▼ Show 20 Lines	for (I = MBB.begin(); I != MBB.end(); I = Next) {
// %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3		// %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
// ...		// ...
// %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>		// %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
MachineOperand &Dst = MI.getOperand(0);		MachineOperand &Dst = MI.getOperand(0);
if (Dst.isReg() &&		if (Dst.isReg() &&
!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))		!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
continue;		continue;

// Folding immediates with more than one use will increase program size.
// FIXME: This will also reduce register usage, which may be better
// in some cases. A better heuristic is needed.
if (FoldingImm && !TII->isInlineConstant(MI, OpToFold, OpSize) &&
!MRI.hasOneUse(MI.getOperand(0).getReg()))
continue;


// We need mutate the operands of new mov instructions to add implicit		// We need mutate the operands of new mov instructions to add implicit
// uses of EXEC, but adding them invalidates the use_iterator, so defer		// uses of EXEC, but adding them invalidates the use_iterator, so defer
// this.		// this.
SmallVector<MachineInstr *, 4> CopiesToReplace;		SmallVector<MachineInstr *, 4> CopiesToReplace;

std::vector<FoldCandidate> FoldList;		std::vector<FoldCandidate> FoldList;
		if (FoldingImm) {
		unsigned NumLiteralUses = 0;
		MachineOperand *NonInlineUse = nullptr;
		int NonInlineUseOpNo = -1;

		// Try to fold any inline immediate uses, and then only fold other
		// constants if they have one use.
		//
		// The legality of the inline immediate must be checked based on the use
		// operand, not the defining instruction, because 32-bit instructions
		// with 32-bit inline immediate sources may be used to materialize
		// constants used in 16-bit operands.
		//
		// e.g. it is unsafe to fold:
		// s_mov_b32 s0, 1.0 // materializes 0x3f800000
		// v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

		// Folding immediates with more than one use will increase program size.
		// FIXME: This will also reduce register usage, which may be better
		// in some cases. A better heuristic is needed.
for (MachineRegisterInfo::use_iterator		for (MachineRegisterInfo::use_iterator
Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();		Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
Use != E; ++Use) {		Use != E; ++Use) {
		MachineInstr *UseMI = Use->getParent();

		if (TII->isInlineConstant(OpToFold, OpSize)) {
		foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
		CopiesToReplace, TII, TRI, MRI);
		} else {
		if (++NumLiteralUses == 1) {
		NonInlineUse = &*Use;
		NonInlineUseOpNo = Use.getOperandNo();
		}
		}
		}

		if (NumLiteralUses == 1) {
		MachineInstr *UseMI = NonInlineUse->getParent();
		foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList,
		CopiesToReplace, TII, TRI, MRI);
		}
		} else {
		// Folding register.
		for (MachineRegisterInfo::use_iterator
		Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
		Use != E; ++Use) {
MachineInstr *UseMI = Use->getParent();		MachineInstr *UseMI = Use->getParent();

foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,		foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
CopiesToReplace, TII, TRI, MRI);		CopiesToReplace, TII, TRI, MRI);
}		}
		}

// Make sure we add EXEC uses to any new v_mov instructions created.		// Make sure we add EXEC uses to any new v_mov instructions created.
for (MachineInstr *Copy : CopiesToReplace)		for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(MF);		Copy->addImplicitDefUseOperands(MF);

for (FoldCandidate &Fold : FoldList) {		for (FoldCandidate &Fold : FoldList) {
if (updateOperand(Fold, TRI)) {		if (updateOperand(Fold, TRI)) {
// Clear kill flags.		// Clear kill flags.
Show All 21 Lines