This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU/SI: Limit load clustering to 16 bytes instead of 4 instructions
ClosedPublic

Authored by • tstellarAMD on Mar 24 2016, 9:34 AM.

Download Raw Diff

Details

Reviewers

nhaehnle
arsenm

Commits

rGa76bcc2ea147: AMDGPU/SI: Limit load clustering to 16 bytes instead of 4 instructions
rL264589: AMDGPU/SI: Limit load clustering to 16 bytes instead of 4 instructions

Summary

This helps prevent load clustering from drastically increasing register
pressure by trying to cluster 4 SMRDx8 loads together. The limit of 16
bytes was chosen, because it seems like that was the original intent
of setting the limit to 4 instructions, but more analysis could show
that a different limit is better.

This fix yields small decreases in register usage with shader-db, but
also helps avoid a large increase in register usage when lane mask
tracking is enabled in the machine scheduler, because lane mask tracking
enables more opportunities for load clustering.

shader-db stats:

2379 shaders in 477 tests
Totals:
SGPRS: 49744 -> 48600 (-2.30 %)
VGPRS: 34120 -> 34076 (-0.13 %)
Code Size: 1282888 -> 1283184 (0.02 %) bytes
LDS: 28 -> 28 (0.00 %) blocks
Scratch: 495616 -> 492544 (-0.62 %) bytes per wave
Max Waves: 6843 -> 6853 (0.15 %)
Wait states: 0 -> 0 (0.00 %)

Diff Detail

Repository: rL LLVM

Event Timeline

• tstellarAMD updated this revision to Diff 51565.Mar 24 2016, 9:34 AM

• tstellarAMD retitled this revision from to AMDGPU/SI: Limit load clustering to 16 bytes instead of 4 instructions.

• tstellarAMD updated this object.

• tstellarAMD added reviewers: nhaehnle, arsenm.

• tstellarAMD added a subscriber: llvm-commits.

Herald added a subscriber: arsenm. · View Herald TranscriptMar 24 2016, 9:34 AM

• tstellarAMD added a child revision: D18452: AMDGPU/SI: Enable lanemask tracking in misched.Mar 24 2016, 9:35 AM

LGTM

This revision is now accepted and ready to land.Mar 24 2016, 9:36 AM

• tstellarAMD added a child revision: D18453: AMDGPU/SI: Improve MachineSchedModel definition.Mar 24 2016, 9:37 AM

Closed by commit rL264589: AMDGPU/SI: Limit load clustering to 16 bytes instead of 4 instructions (authored by tstellar). · Explain WhyMar 28 2016, 9:15 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

SIInstrInfo.cpp

41 lines

test/

CodeGen/

AMDGPU/

ctpop.ll

2 lines

madak.ll

2 lines

schedule-kernel-arg-loads.ll

14 lines

Diff 51798

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 289 Lines • ▼ Show 20 Lines	bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
}		}

return false;		return false;
}		}

bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,		bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
MachineInstr *SecondLdSt,		MachineInstr *SecondLdSt,
unsigned NumLoads) const {		unsigned NumLoads) const {
// TODO: This needs finer tuning		const MachineOperand *FirstDst = nullptr;
if (NumLoads > 4)		const MachineOperand *SecondDst = nullptr;

		if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
		FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
		SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
		}

		if (isSMRD(FirstLdSt) && isSMRD(FirstLdSt)) {
		FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::sdst);
		SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::sdst);
		}

		if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
		(isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
		FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
		SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
		}

		if (!FirstDst || !SecondDst)
return false;		return false;

if (isDS(FirstLdSt) && isDS(SecondLdSt))		// Try to limit clustering based on the total number of bytes loaded
return true;		// rather than the number of instructions. This is done to help reduce
		// register pressure. The method used is somewhat inexact, though,
		// because it assumes that all loads in the cluster will load the
		// same number of bytes as FirstLdSt.

if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt))		// The unit of this value is bytes.
return true;		// FIXME: This needs finer tuning.
		unsigned LoadClusterThreshold = 16;

		const MachineRegisterInfo &MRI =
		FirstLdSt->getParent()->getParent()->getRegInfo();
		const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());

return (isMUBUF(FirstLdSt) || isMTBUF(FirstLdSt)) &&		return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
(isMUBUF(SecondLdSt) || isMTBUF(SecondLdSt));
}		}

void		void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,		SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,		MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,		unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {		bool KillSrc) const {

▲ Show 20 Lines • Show All 2,614 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll

Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
%ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone		%ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
%add = add i32 %ctpop0, %ctpop1		%add = add i32 %ctpop0, %ctpop1
store i32 %add, i32 addrspace(1)* %out, align 4		store i32 %add, i32 addrspace(1)* %out, align 4
ret void		ret void
}		}

; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:		; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
; GCN: buffer_load_dword [[VAL0:v[0-9]+]],		; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
; GCN-NEXT: s_waitcnt		; GCN: s_waitcnt
; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}		; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[RESULT]],		; GCN-NEXT: buffer_store_dword [[RESULT]],
; GCN: s_endpgm		; GCN: s_endpgm
define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {		define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
%val0 = load i32, i32 addrspace(1)* %in0, align 4		%val0 = load i32, i32 addrspace(1)* %in0, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone		%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%add = add i32 %ctpop0, %sval		%add = add i32 %ctpop0, %sval
store i32 %add, i32 addrspace(1)* %out, align 4		store i32 %add, i32 addrspace(1)* %out, align 4
▲ Show 20 Lines • Show All 229 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/madak.ll

Show First 20 Lines • Show All 95 Lines • ▼ Show 20 Lines	define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%mul = fmul float %a, %b		%mul = fmul float %a, %b
%madak = fadd float %mul, 4.0		%madak = fadd float %mul, 4.0
store float %madak, float addrspace(1)* %out.gep, align 4		store float %madak, float addrspace(1)* %out.gep, align 4
ret void		ret void
}		}

; We can't use an SGPR when forming madak		; We can't use an SGPR when forming madak
; GCN-LABEL: {{^}}s_v_madak_f32:		; GCN-LABEL: {{^}}s_v_madak_f32:
; GCN: s_load_dword [[SB:s[0-9]+]]		; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000		; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]		; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32		; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]		; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {		define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone		%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid		%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid		%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll

	; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s

	; FUNC-LABEL: {{^}}cluster_arg_loads:			; FUNC-LABEL: {{^}}cluster_arg_loads:
				; FIXME: Due to changes in the load clustering heuristics. We now longer
				; cluster all argument loads together.
				; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
				; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
	; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9			; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
	; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb			; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
	; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd			; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
	; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
	; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
	; VI-NEXT: s_nop 0			; VI-NEXT: s_nop 0
	; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c			; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
	; VI-NEXT: s_nop 0			; VI-NEXT: s_nop 0
	; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34			; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
	; VI-NEXT: s_nop 0			; VI-NEXT: s_nop 0
	; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38			; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
	define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {			define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
	store i32 %x, i32 addrspace(1)* %out0, align 4			store i32 %x, i32 addrspace(1)* %out0, align 4
	store i32 %y, i32 addrspace(1)* %out1, align 4			store i32 %y, i32 addrspace(1)* %out1, align 4
	ret void			ret void
	}			}

	; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when			; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when
	; s_load_dwordx2 has a register offset			; s_load_dwordx2 has a register offset
	Show All 28 Lines