This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU/SI: Fold operands with sub-registers
ClosedPublic

Authored by nhaehnle on Jan 4 2016, 3:00 PM.

Download Raw Diff

Details

Reviewers

• tstellarAMD
arsenm
mareko

Commits

rG82fc962c2018: AMDGPU/SI: Fold operands with sub-registers
rL257074: AMDGPU/SI: Fold operands with sub-registers

Summary

Multi-dword constant loads generated unnecessary moves from SGPRs into VGPRs,
increasing the code size and VGPR pressure. These moves are now folded away.

Note that this lack of operand folding was not a problem for VMEM loads,
because COPY nodes from VReg_Nnn to VGPR32 are eliminated by the register
coalescer.

Some tests are updated; note that the fsub.ll test explicitly checks that
the move is elided.

With the IR generated by current Mesa, the changes are obviously relatively
minor:

7063 shaders in 3531 tests
Totals:
SGPRS: 351872 -> 352560 (0.20 %)
VGPRS: 199984 -> 200732 (0.37 %)
Code Size: 9876968 -> 9881112 (0.04 %) bytes
LDS: 91 -> 91 (0.00 %) blocks
Scratch: 1779712 -> 1767424 (-0.69 %) bytes per wave
Wait states: 295164 -> 295337 (0.06 %)

Totals from affected shaders:
SGPRS: 65784 -> 66472 (1.05 %)
VGPRS: 38064 -> 38812 (1.97 %)
Code Size: 1993828 -> 1997972 (0.21 %) bytes
LDS: 42 -> 42 (0.00 %) blocks
Scratch: 795648 -> 783360 (-1.54 %) bytes per wave
Wait states: 54026 -> 54199 (0.32 %)

Diff Detail

Repository: rL LLVM

Event Timeline

nhaehnle updated this revision to Diff 43929. · Jan 4 2016, 3:00 PM

nhaehnle retitled this revision to AMDGPU/SI: Fold operands with sub-registers.

nhaehnle updated this object.

nhaehnle added reviewers: • tstellarAMD, arsenm, mareko.

nhaehnle added a subscriber: llvm-commits.

Herald added a subscriber: arsenm. · View Herald Transcript · Jan 4 2016, 3:00 PM

LGTM.

This revision is now accepted and ready to land. · Jan 6 2016, 8:14 PM

Closed by commit rL257074: AMDGPU/SI: Fold operands with sub-registers (authored by nha). · Explain Why · Jan 7 2016, 9:14 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

2 lines

5 lines

4 lines

34 lines

test/

CodeGen/

AMDGPU/

fmin_legacy.ll

4 lines

fsub.ll

15 lines

llvm.round.f64.ll

2 lines

Diff 44223

llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Show First 20 Lines • Show All 209 Lines • ▼ Show 20 Lines	static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
// =>		// =>
// VGPRx = COPY SGPRx		// VGPRx = COPY SGPRx
// VGPRz = REG_SEQUENCE VGPRx, sub0		// VGPRz = REG_SEQUENCE VGPRx, sub0

MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());		MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());

for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {		for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
unsigned SrcReg = MI.getOperand(I).getReg();		unsigned SrcReg = MI.getOperand(I).getReg();
unsigned SrcSubReg = MI.getOperand(I).getReg();		unsigned SrcSubReg = MI.getOperand(I).getSubReg();

const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);		const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
assert(TRI->isSGPRClass(SrcRC) &&		assert(TRI->isSGPRClass(SrcRC) &&
"Expected SGPR REG_SEQUENCE to only have SGPR inputs");		"Expected SGPR REG_SEQUENCE to only have SGPR inputs");

SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);		SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);		const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

▲ Show 20 Lines • Show All 147 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 328 Lines • ▼ Show 20 Lines	for (I = MBB.begin(); I != MBB.end(); I = Next) {

// Folding immediates with more than one use will increase program size.		// Folding immediates with more than one use will increase program size.
// FIXME: This will also reduce register usage, which may be better		// FIXME: This will also reduce register usage, which may be better
// in some cases. A better heuristic is needed.		// in some cases. A better heuristic is needed.
if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&		if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
!MRI.hasOneUse(MI.getOperand(0).getReg()))		!MRI.hasOneUse(MI.getOperand(0).getReg()))
continue;		continue;

// FIXME: Fold operands with subregs.
if (OpToFold.isReg() &&		if (OpToFold.isReg() &&
(!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||		!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
OpToFold.getSubReg()))
continue;		continue;


// We need mutate the operands of new mov instructions to add implicit		// We need mutate the operands of new mov instructions to add implicit
// uses of EXEC, but adding them invalidates the use_iterator, so defer		// uses of EXEC, but adding them invalidates the use_iterator, so defer
// this.		// this.
SmallVector<MachineInstr *, 4> CopiesToReplace;		SmallVector<MachineInstr *, 4> CopiesToReplace;

std::vector<FoldCandidate> FoldList;		std::vector<FoldCandidate> FoldList;
for (MachineRegisterInfo::use_iterator		for (MachineRegisterInfo::use_iterator
Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();		Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
Show All 30 Lines

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 1,771 Lines • ▼ Show 20 Lines	if (!MO.isReg())
return false;		return false;

unsigned Reg = MO.getReg();		unsigned Reg = MO.getReg();
const TargetRegisterClass *RC =		const TargetRegisterClass *RC =
TargetRegisterInfo::isVirtualRegister(Reg) ?		TargetRegisterInfo::isVirtualRegister(Reg) ?
MRI.getRegClass(Reg) :		MRI.getRegClass(Reg) :
RI.getPhysRegClass(Reg);		RI.getPhysRegClass(Reg);

		const SIRegisterInfo *TRI =
		static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
		RC = TRI->getSubRegClass(RC, MO.getSubReg());

// In order to be legal, the common sub-class must be equal to the		// In order to be legal, the common sub-class must be equal to the
// class of the current operand. For example:		// class of the current operand. For example:
//		//
// v_mov_b32 s0 ; Operand defined as vsrc_32		// v_mov_b32 s0 ; Operand defined as vsrc_32
// ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL		// ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
//		//
// s_sendmsg 0, s0 ; Operand defined as m0reg		// s_sendmsg 0, s0 ; Operand defined as m0reg
// ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL		// ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
▲ Show 20 Lines • Show All 1,290 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp

Show First 20 Lines • Show All 458 Lines • ▼ Show 20 Lines	const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
}		}
}		}

const TargetRegisterClass *SIRegisterInfo::getSubRegClass(		const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
const TargetRegisterClass *RC, unsigned SubIdx) const {		const TargetRegisterClass *RC, unsigned SubIdx) const {
if (SubIdx == AMDGPU::NoSubRegister)		if (SubIdx == AMDGPU::NoSubRegister)
return RC;		return RC;

// If this register has a sub-register, we can safely assume it is a 32-bit		// We can assume that each lane corresponds to one 32-bit register.
// register, because all of SI's sub-registers are 32-bit.		unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
if (isSGPRClass(RC)) {		if (isSGPRClass(RC)) {
		switch (Count) {
		case 1:
return &AMDGPU::SGPR_32RegClass;		return &AMDGPU::SGPR_32RegClass;
		case 2:
		return &AMDGPU::SReg_64RegClass;
		case 4:
		return &AMDGPU::SReg_128RegClass;
		case 8:
		return &AMDGPU::SReg_256RegClass;
		case 16: /* fall-through */
		default:
		llvm_unreachable("Invalid sub-register class size");
		}
} else {		} else {
		switch (Count) {
		case 1:
return &AMDGPU::VGPR_32RegClass;		return &AMDGPU::VGPR_32RegClass;
		case 2:
		return &AMDGPU::VReg_64RegClass;
		case 3:
		return &AMDGPU::VReg_96RegClass;
		case 4:
		return &AMDGPU::VReg_128RegClass;
		case 8:
		return &AMDGPU::VReg_256RegClass;
		case 16: /* fall-through */
		default:
		llvm_unreachable("Invalid sub-register class size");
		}
}		}
}		}

bool SIRegisterInfo::shouldRewriteCopySrc(		bool SIRegisterInfo::shouldRewriteCopySrc(
const TargetRegisterClass *DefRC,		const TargetRegisterClass *DefRC,
unsigned DefSubReg,		unsigned DefSubReg,
const TargetRegisterClass *SrcRC,		const TargetRegisterClass *SrcRC,
unsigned SrcSubReg) const {		unsigned SrcSubReg) const {
▲ Show 20 Lines • Show All 172 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll

	; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s			; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

	; FIXME: Should replace unsafe-fp-math with no signed zeros.			; FIXME: Should replace unsafe-fp-math with no signed zeros.

	declare i32 @llvm.r600.read.tidig.x() #1			declare i32 @llvm.r600.read.tidig.x() #1

	; FUNC-LABEL: @test_fmin_legacy_f32			; FUNC-LABEL: @test_fmin_legacy_f32
	; EG: MIN *			; EG: MIN *
	; SI-SAFE: v_min_legacy_f32_e32			; SI-SAFE: v_min_legacy_f32_e64
	; SI-NONAN: v_min_f32_e32			; SI-NONAN: v_min_f32_e64
	define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {			define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
	%r0 = extractelement <4 x float> %reg0, i32 0			%r0 = extractelement <4 x float> %reg0, i32 0
	%r1 = extractelement <4 x float> %reg0, i32 1			%r1 = extractelement <4 x float> %reg0, i32 1
	%r2 = fcmp uge float %r0, %r1			%r2 = fcmp uge float %r0, %r1
	%r3 = select i1 %r2, float %r1, float %r0			%r3 = select i1 %r2, float %r1, float %r0
	%vec = insertelement <4 x float> undef, float %r3, i32 0			%vec = insertelement <4 x float> undef, float %r3, i32 0
	store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16			store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16
	ret void			ret void
	▲ Show 20 Lines • Show All 166 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/fsub.ll

Show All 26 Lines
declare float @llvm.R600.load.input(i32) readnone		declare float @llvm.R600.load.input(i32) readnone

declare void @llvm.AMDGPU.store.output(float, i32)		declare void @llvm.AMDGPU.store.output(float, i32)

; FUNC-LABEL: {{^}}fsub_v2f32:		; FUNC-LABEL: {{^}}fsub_v2f32:
; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z		; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y		; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y

; FIXME: Should be using SGPR directly for first operand		; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}		; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {		define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
%sub = fsub <2 x float> %a, %b		%sub = fsub <2 x float> %a, %b
store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8		store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
ret void		ret void
}		}

; FUNC-LABEL: {{^}}v_fsub_v4f32:		; FUNC-LABEL: {{^}}v_fsub_v4f32:
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}		; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
Show All 9 Lines	define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1		%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16		%a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
%b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16		%b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
%result = fsub <4 x float> %a, %b		%result = fsub <4 x float> %a, %b
store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16		store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
ret void		ret void
}		}

; FIXME: Should be using SGPR directly for first operand

; FUNC-LABEL: {{^}}s_fsub_v4f32:		; FUNC-LABEL: {{^}}s_fsub_v4f32:
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}		; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}		; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}		; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}		; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: s_endpgm		; SI: s_endpgm
define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {		define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
%result = fsub <4 x float> %a, %b		%result = fsub <4 x float> %a, %b
store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16		store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
ret void		ret void
}		}

llvm/trunk/test/CodeGen/AMDGPU/llvm.round.f64.ll

	Show All 15 Lines
	; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11			; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11

	; SI-DAG: v_not_b32_e32			; SI-DAG: v_not_b32_e32
	; SI-DAG: v_not_b32_e32			; SI-DAG: v_not_b32_e32

	; SI-DAG: v_cmp_eq_i32			; SI-DAG: v_cmp_eq_i32

	; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff			; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
	; SI-DAG: v_cmp_gt_i32_e32			; SI-DAG: v_cmp_gt_i32
	; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]			; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]

	; SI: buffer_store_dwordx2			; SI: buffer_store_dwordx2
	; SI: s_endpgm			; SI: s_endpgm
	define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {			define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
	%tid = call i32 @llvm.r600.read.tidig.x() #1			%tid = call i32 @llvm.r600.read.tidig.x() #1
	%gep = getelementptr double, double addrspace(1)* %in, i32 %tid			%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
	%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid			%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
	Show All 39 Lines