This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Fix computation for getOccupancyWithLocalMemSize
ClosedPublic

Authored by arsenm on Mar 2 2020, 11:35 AM.

Download Raw Diff

Details

Reviewers

rampitec
kzhuravl
t-tye

Summary

The computation here didn't really make sense to me, and reported
wildly different results depending on the flat work group size
attribute.

I think this should really report a range derived from the possible
work group size bounds, and only allow an occupancy that is a multiple
of the group size.

Diff Detail

Event Timeline

arsenm created this revision. · Mar 2 2020, 11:35 AM

Herald added a project: Restricted Project. · View Herald Transcript · Mar 2 2020, 11:35 AM

Herald added subscribers: llvm-commits, kerbowa, hiraditya and 6 others. · View Herald Transcript

rampitec added inline comments. · Mar 2 2020, 12:00 PM

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
361	Yes, we cannot split a group. I think it is better to uncomment this.

arsenm marked an inline comment as done. · Mar 2 2020, 12:28 PM

arsenm added inline comments.

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
361	The problem is this assumes the very worst case. This would hit 0 with the default / maximum group size, and need clamping to 1. This breaks every test. I was thinking I would try to apply this after changing this to report a range of occupancies

LGTM

This revision is now accepted and ready to land. · Mar 2 2020, 12:36 PM

88aced1e454195e038560abb3a0732d020aa4295

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUSubtarget.cpp

41 lines

test/

CodeGen/

AMDGPU/

occupancy-levels.ll

89 lines

Diff 247697

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Show First 20 Lines • Show All 322 Lines • ▼ Show 20 Lines	unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;		unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);		unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
if (!WorkGroupsPerCu)		if (!WorkGroupsPerCu)
return 0;		return 0;
unsigned MaxWaves = getMaxWavesPerEU();		unsigned MaxWaves = getMaxWavesPerEU();
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;		return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}		}

		// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,		unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {		const Function &F) const {
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;		const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);		const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
if (!WorkGroupsPerCu)		if (!MaxWorkGroupsPerCu)
return 0;		return 0;
unsigned MaxWaves = getMaxWavesPerEU();
unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;		const unsigned WaveSize = getWavefrontSize();
unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
NumWaves = std::min(NumWaves, MaxWaves);		// FIXME: Do we need to account for alignment requirement of LDS rounding the
NumWaves = std::max(NumWaves, 1u);		// size up?
return NumWaves;		// Compute restriction based on LDS usage
		unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

		// This can be queried with more LDS than is possible, so just assume the
		// worst.
		if (NumGroups == 0)
		return 1;

		NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

		// Round to the number of waves.
		const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
		unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

		// Clamp to the maximum possible number of waves.
		MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

		// FIXME: Needs to be a multiple of the group size?
		//MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
		[Inline comment, rampitec, not done] Yes, we cannot split a group. I think it is better to uncomment this.
		[Inline comment, arsenm (author), done] The problem is this assumes the very worst case. This would hit 0 with the default / maximum group size, and need clamping to 1. This breaks every test. I was thinking I would try to apply this after changing this to report a range of occupancies.

		assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
		"computed invalid occupancy");
		return MaxWaves;
}		}

unsigned		unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {		AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();		const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());		return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}		}

▲ Show 20 Lines • Show All 551 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/occupancy-levels.ll

	Show First 20 Lines • Show All 277 Lines • ▼ Show 20 Lines
	; GFX1010W32: ; Occupancy: 20			; GFX1010W32: ; Occupancy: 20
	@lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4			@lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
	define amdgpu_kernel void @used_lds_13112() {			define amdgpu_kernel void @used_lds_13112() {
	%p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*			%p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*
	store volatile i8 1, i8 addrspace(3)* %p			store volatile i8 1, i8 addrspace(3)* %p
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64:
				; GFX9: ; Occupancy: 7{{$}}
				; GFX101064: ; Occupancy: 7{{$}}
				; GFX1010W32: ; Occupancy: 14{{$}}
				@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
				define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
				%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
				store volatile i8 1, i8 addrspace(3)* %p
				ret void
				}

				; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96:
				; GFX9: ; Occupancy: 10{{$}}
				; GFX1010W64: ; Occupancy: 14{{$}}
				; GFX1010W32: ; Occupancy: 20{{$}}
				define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
				%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
				store volatile i8 1, i8 addrspace(3)* %p
				ret void
				}

				; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128:
				; GFX9: ; Occupancy: 10{{$}}
				; GFX1010W64: ; Occupancy: 14{{$}}
				; GFX1010W32: ; Occupancy: 20{{$}}
				define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
				%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
				store volatile i8 1, i8 addrspace(3)* %p
				ret void
				}

				; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192:
				; GFX9: ; Occupancy: 10{{$}}
				; GFX1010W64: ; Occupancy: 20{{$}}
				; GFX1010W32: ; Occupancy: 20{{$}}
				define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
				%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
				store volatile i8 1, i8 addrspace(3)* %p
				ret void
				}

				; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256:
				; GFX9: ; Occupancy: 10{{$}}
				; GFX1010W64: ; Occupancy: 20{{$}}
				; GFX1010W32: ; Occupancy: 20{{$}}
				define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
				%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
				store volatile i8 1, i8 addrspace(3)* %p
				ret void
				}

				; GCN-LABEL: {{^}}used_lds_8252_max_group_size_512:
				; GFX9: ; Occupancy: 10{{$}}
				; GFX1010W64: ; Occupancy: 20{{$}}
				; GFX1010W32: ; Occupancy: 20{{$}}
				define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
				%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
				store volatile i8 1, i8 addrspace(3)* %p
				ret void
				}

				; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024:
				; GFX9: ; Occupancy: 10{{$}}
				; GFX1010W64: ; Occupancy: 20{{$}}
				; GFX1010W32: ; Occupancy: 20{{$}}
				define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
				%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
				store volatile i8 1, i8 addrspace(3)* %p
				ret void
				}

				; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
				; GFX9: ; Occupancy: 7{{$}}
				; GFX1010W64: ; Occupancy: 7{{$}}
				; GFX1010W32: ; Occupancy: 7{{$}}
				define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
				%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
				store volatile i8 1, i8 addrspace(3)* %p
				ret void
				}

	attributes #0 = { "amdgpu-waves-per-eu"="2,3" }			attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
	attributes #1 = { "amdgpu-waves-per-eu"="18,18" }			attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
	attributes #2 = { "amdgpu-waves-per-eu"="19,19" }			attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
				attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
				attributes #4 = { "amdgpu-flat-work-group-size"="1,96" }
				attributes #5 = { "amdgpu-flat-work-group-size"="1,128" }
				attributes #6 = { "amdgpu-flat-work-group-size"="1,192" }
				attributes #7 = { "amdgpu-flat-work-group-size"="1,256" }
				attributes #8 = { "amdgpu-flat-work-group-size"="1,512" }
				attributes #9 = { "amdgpu-flat-work-group-size"="1,1024" }
				attributes #10 = { "amdgpu-flat-work-group-size"="1,32" }