This is an archive of the discontinued LLVM Phabricator instance.

buffer_load_dword v3, off, s[72:75], s70 offset:1444 ; 16-byte Folded Reload
                                ; encoding: [0xa4,0x05,0x30,0xe0,0x00,0x03,0x12,0x46]
s_waitcnt vmcnt(0)              ; encoding: [0x70,0x0f,0x8c,0xbf]
buffer_load_dword v4, off, s[72:75], s70 offset:1448 ; 16-byte Folded Reload
                                ; encoding: [0xa8,0x05,0x30,0xe0,0x00,0x04,0x12,0x46]
s_waitcnt vmcnt(0)              ; encoding: [0x70,0x0f,0x8c,0xbf]

etc., so you actually get 12 bytes per dword. Not sure if that's a problem, especially since those waits are really wrong anyway (perhaps the wait insertion gets confused by the register/subregister relationship?).

arsenm added inline comments. Aug 11 2016, 12:04 PM

lib/Target/AMDGPU/SIInstructions.td
1962–1963	I'm not really sure what to do about waitcnts. It doesn't really matter for correctness, since the branch relax pass right now runs after these should be eliminated (these may be inserted during relaxation, but that isn't a concern yet).

Fair enough. LGTM.

This revision is now accepted and ready to land. Sep 1 2016, 9:10 AM

r280595

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIInstrInfo.cpp

4 lines

SIInstructions.td

5 lines

SIRegisterInfo.td

7 lines

Diff 67576

lib/Target/AMDGPU/SIInstrInfo.cpp

	Show First 20 Lines • Show All 3,140 Lines • ▼ Show 20 Lines

	unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {			unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
	unsigned Opc = MI.getOpcode();			unsigned Opc = MI.getOpcode();
	const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);			const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
	unsigned DescSize = Desc.getSize();			unsigned DescSize = Desc.getSize();

	// If we have a definitive size, we can use it. Otherwise we need to inspect			// If we have a definitive size, we can use it. Otherwise we need to inspect
	// the operands to know the size.			// the operands to know the size.
	if (DescSize == 8 \|\| DescSize == 4)			if (DescSize != 0)
	return DescSize;			return DescSize;

	assert(DescSize == 0);

	// 4-byte instructions may have a 32-bit literal encoded after them. Check			// 4-byte instructions may have a 32-bit literal encoded after them. Check
	// operands that coud ever be literals.			// operands that coud ever be literals.
	if (isVALU(MI) \|\| isSALU(MI)) {			if (isVALU(MI) \|\| isSALU(MI)) {
	int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);			int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
	if (Src0Idx == -1)			if (Src0Idx == -1)
	return 4; // No operands.			return 4; // No operands.

	if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))			if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
	▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 1,942 Lines • ▼ Show 20 Lines
	multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {			multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
	let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in {			let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in {
	def _SAVE : PseudoInstSI <			def _SAVE : PseudoInstSI <
	(outs),			(outs),
	(ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,			(ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
	SReg_32:$scratch_offset, i32imm:$offset)> {			SReg_32:$scratch_offset, i32imm:$offset)> {
	let mayStore = 1;			let mayStore = 1;
	let mayLoad = 0;			let mayLoad = 0;
				// (2 * 4) + (8 * num_subregs) bytes maximum
				let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
	}			}

	def _RESTORE : PseudoInstSI <			def _RESTORE : PseudoInstSI <
	(outs vgpr_class:$dst),			(outs vgpr_class:$dst),
	(ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,			(ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
	i32imm:$offset)> {			i32imm:$offset)> {
	let mayStore = 0;			let mayStore = 0;
	let mayLoad = 1;			let mayLoad = 1;

				// (2 * 4) + (8 * num_subregs) bytes maximum
				let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
				nhaehnleUnsubmitted Not Done Reply Inline Actions I just took a look, and for some reason the reloads tend to look like buffer_load_dword v3, off, s[72:75], s70 offset:1444 ; 16-byte Folded Reload ; encoding: [0xa4,0x05,0x30,0xe0,0x00,0x03,0x12,0x46] s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] buffer_load_dword v4, off, s[72:75], s70 offset:1448 ; 16-byte Folded Reload ; encoding: [0xa8,0x05,0x30,0xe0,0x00,0x04,0x12,0x46] s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] etc., so you actually get 12 bytes per dword. Not sure if that's a problem, especially since those waits are really wrong anyway (perhaps the wait insertion gets confused by the register/subregister relationship?). nhaehnle: I just took a look, and for some reason the reloads tend to look like…
				arsenmAuthorUnsubmitted Not Done Reply Inline Actions I'm not really sure what to do about waitcnts. It doesn't really matter for correctness, since the branch relax pass right now runs after these should be eliminated (these may be inserted during relaxation but isn't a concern yet) arsenm: I'm not really sure what to do about waitcnts. It doesn't really matter for correctness, since…
	}			}
	} // End UseNamedOperandTable = 1, VGPRSpill = 1			} // End UseNamedOperandTable = 1, VGPRSpill = 1
	}			}

	defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;			defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
	defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;			defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
	defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;			defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
	defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;			defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
	▲ Show 20 Lines • Show All 1,420 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIRegisterInfo.td

Show First 20 Lines • Show All 187 Lines • ▼ Show 20 Lines	def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
(add (decimate (shl TTMP_32, 1), 4)),		(add (decimate (shl TTMP_32, 1), 4)),
(add (decimate (shl TTMP_32, 2), 4)),		(add (decimate (shl TTMP_32, 2), 4)),
(add (decimate (shl TTMP_32, 3), 4))]>;		(add (decimate (shl TTMP_32, 3), 4))]>;

// VGPR 32-bit registers		// VGPR 32-bit registers
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,		def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
(add (sequence "VGPR%u", 0, 255))> {		(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;		let AllocationPriority = 1;
		let Size = 32;
}		}

// VGPR 64-bit registers		// VGPR 64-bit registers
def VGPR_64 : RegisterTuples<[sub0, sub1],		def VGPR_64 : RegisterTuples<[sub0, sub1],
[(add (trunc VGPR_32, 255)),		[(add (trunc VGPR_32, 255)),
(add (shl VGPR_32, 1))]>;		(add (shl VGPR_32, 1))]>;

// VGPR 96-bit registers		// VGPR 96-bit registers
▲ Show 20 Lines • Show All 97 Lines • ▼ Show 20 Lines
def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {		def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
// Requires 8 s_mov_b64 to copy		// Requires 8 s_mov_b64 to copy
let CopyCost = 8;		let CopyCost = 8;
let AllocationPriority = 6;		let AllocationPriority = 6;
}		}

// Register class for all vector registers (VGPRs + Interploation Registers)		// Register class for all vector registers (VGPRs + Interploation Registers)
def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {		def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
		let Size = 64;

// Requires 2 v_mov_b32 to copy		// Requires 2 v_mov_b32 to copy
let CopyCost = 2;		let CopyCost = 2;
let AllocationPriority = 2;		let AllocationPriority = 2;
}		}

def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {		def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
let Size = 96;		let Size = 96;

// Requires 3 v_mov_b32 to copy		// Requires 3 v_mov_b32 to copy
let CopyCost = 3;		let CopyCost = 3;
let AllocationPriority = 3;		let AllocationPriority = 3;
}		}

def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {		def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
		let Size = 128;

// Requires 4 v_mov_b32 to copy		// Requires 4 v_mov_b32 to copy
let CopyCost = 4;		let CopyCost = 4;
let AllocationPriority = 4;		let AllocationPriority = 4;
}		}

def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {		def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
		let Size = 256;
let CopyCost = 8;		let CopyCost = 8;
let AllocationPriority = 5;		let AllocationPriority = 5;
}		}

def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {		def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
		let Size = 512;
let CopyCost = 16;		let CopyCost = 16;
let AllocationPriority = 6;		let AllocationPriority = 6;
}		}

def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {		def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
let Size = 32;		let Size = 32;
}		}

▲ Show 20 Lines • Show All 83 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Set sizes of spill pseudos (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 67576

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIInstructions.td

lib/Target/AMDGPU/SIRegisterInfo.td

AMDGPU: Set sizes of spill pseudos
Closed, Public