This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: More bits of frame index are known to be zero
ClosedPublic

Authored by arsenm on Feb 22 2016, 6:15 PM.

Download Raw Diff

Details

Reviewers

Summary

The maximum private (scratch) allocation for the whole GPU is 4 GB,
so the largest possible frame index for a single workitem is that
maximum size divided by the smallest granularity for a dispatch.

This increases the number of known-zero high bits, which
enables more offset folding. With this limit the maximum private size per
workitem is 128 MB (4 GB / 32, assuming a half-wave minimum dispatch granularity), but it may be smaller still.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 48763. · Feb 22 2016, 6:15 PM

arsenm set the title of this revision to "AMDGPU: More bits of frame index are known to be zero".

arsenm updated this object.

arsenm added a reviewer: tstellarAMD.

arsenm added a subscriber: llvm-commits.

Herald added a subscriber: arsenm. · View Herald Transcript · Feb 22 2016, 6:15 PM

LGTM.

This revision is now accepted and ready to land. · Feb 24 2016, 7:05 PM

r262153

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

8 lines

5 lines

2 lines

40 lines

test/

CodeGen/

AMDGPU/

private-element-size.ll

18 lines

scratch-buffer.ll

9 lines

Diff 48763

lib/Target/AMDGPU/AMDGPU.td

Show First 20 Lines • Show All 180 Lines • ▼ Show 20 Lines	class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
!cast<string>(size),		!cast<string>(size),
"Maximum private access size may be "#size		"Maximum private access size may be "#size
>;		>;

def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;		def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;		def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;		def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;


def FeatureEnableHugeScratchBuffer : SubtargetFeature<
"huge-scratch-buffer",
"EnableHugeScratchBuffer",
"true",
"Enable scratch buffer sizes greater than 128 GB"
>;

def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",		def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
"EnableVGPRSpilling",		"EnableVGPRSpilling",
"true",		"true",
"Enable spilling of VGPRs to scratch memory"		"Enable spilling of VGPRs to scratch memory"
>;		>;

def FeatureDumpCode : SubtargetFeature <"DumpCode",		def FeatureDumpCode : SubtargetFeature <"DumpCode",
"DumpCode",		"DumpCode",
▲ Show 20 Lines • Show All 175 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUSubtarget.h

Show First 20 Lines • Show All 85 Lines • ▼ Show 20 Lines	private:
bool SGPRInitBug;		bool SGPRInitBug;
bool IsGCN;		bool IsGCN;
bool GCN1Encoding;		bool GCN1Encoding;
bool GCN3Encoding;		bool GCN3Encoding;
bool CIInsts;		bool CIInsts;
bool FeatureDisable;		bool FeatureDisable;
int LDSBankCount;		int LDSBankCount;
unsigned IsaVersion;		unsigned IsaVersion;
bool EnableHugeScratchBuffer;
bool EnableSIScheduler;		bool EnableSIScheduler;

std::unique_ptr<AMDGPUFrameLowering> FrameLowering;		std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;		std::unique_ptr<AMDGPUTargetLowering> TLInfo;
std::unique_ptr<AMDGPUInstrInfo> InstrInfo;		std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
InstrItineraryData InstrItins;		InstrItineraryData InstrItins;
Triple TargetTriple;		Triple TargetTriple;

▲ Show 20 Lines • Show All 175 Lines • ▼ Show 20 Lines	void overrideSchedPolicy(MachineSchedPolicy &Policy,
MachineInstr begin, MachineInstr end,		MachineInstr begin, MachineInstr end,
unsigned NumRegionInstrs) const override;		unsigned NumRegionInstrs) const override;

// Helper functions to simplify if statements		// Helper functions to simplify if statements
bool isTargetELF() const {		bool isTargetELF() const {
return false;		return false;
}		}

bool enableHugeScratchBuffer() const {
return EnableHugeScratchBuffer;
}

bool enableSIScheduler() const {		bool enableSIScheduler() const {
return EnableSIScheduler;		return EnableSIScheduler;
}		}

bool dumpCode() const {		bool dumpCode() const {
return DumpCode;		return DumpCode;
}		}
bool r600ALUEncoding() const {		bool r600ALUEncoding() const {
Show All 37 Lines

lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Show First 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	: AMDGPUGenSubtargetInfo(TT, GPU, FS),
EnablePromoteAlloca(false),		EnablePromoteAlloca(false),
EnableIfCvt(true), EnableLoadStoreOpt(false),		EnableIfCvt(true), EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),		EnableUnsafeDSOffsetFolding(false),
EnableXNACK(false),		EnableXNACK(false),
WavefrontSize(0), CFALUBug(false),		WavefrontSize(0), CFALUBug(false),
LocalMemorySize(0), MaxPrivateElementSize(0),		LocalMemorySize(0), MaxPrivateElementSize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),		EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),		GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),		IsaVersion(ISAVersion0_0_0),
EnableSIScheduler(false), FrameLowering(nullptr),		EnableSIScheduler(false), FrameLowering(nullptr),
InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {		InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {

initializeSubtargetDependencies(TT, GPU, FS);		initializeSubtargetDependencies(TT, GPU, FS);

const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16)		const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16)

if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {		if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
▲ Show 20 Lines • Show All 66 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

	Show First 20 Lines • Show All 1,169 Lines • ▼ Show 20 Lines
	}			}

	SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {			SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

	SDLoc SL(Op);			SDLoc SL(Op);
	FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);			FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
	unsigned FrameIndex = FINode->getIndex();			unsigned FrameIndex = FINode->getIndex();

	// A FrameIndex node represents a 32-bit offset into scratch memory. If			// A FrameIndex node represents a 32-bit offset into scratch memory. If the
	// the high bit of a frame index offset were to be set, this would mean			// high bit of a frame index offset were to be set, this would mean that it
	// that it represented an offset of ~2GB * 64 = ~128GB from the start of the			// represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
	// scratch buffer, with 64 being the number of threads per wave.			// buffer, with 64 being the number of threads per wave.
	//			//
	// If we know the machine uses less than 128GB of scratch, then we can			// The maximum private allocation for the entire GPU is 4G, and we are
	// amrk the high bit of the FrameIndex node as known zero,			// concerned with the largest the index could ever be for an individual
	// which is important, because it means in most situations we can			// workitem. This will occur with the minmum dispatch size. If a program
	// prove that values derived from FrameIndex nodes are non-negative.			// requires more, the dispatch size will be reduced.
	// This enables us to take advantage of more addressing modes when			//
	// accessing scratch buffers, since for scratch reads/writes, the register			// With this limit, we can mark the high bit of the FrameIndex node as known
	// offset must always be positive.			// zero, which is important, because it means in most situations we can prove
				// that values derived from FrameIndex nodes are non-negative. This enables us
				// to take advantage of more addressing modes when accessing scratch buffers,
				// since for scratch reads/writes, the register offset must always be
				// positive.

				uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;

				// XXX - It is unclear if partial dispatch works. Assume it works at half wave
				// granularity. It is probably a full wave.
				uint64_t MinGranularity = 32;

	SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);			unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
	if (Subtarget->enableHugeScratchBuffer())			EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
	return TFI;

				SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
	return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,			return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
	DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));			DAG.getValueType(ExtVT));
	}			}

	bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {			bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
	if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)			if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
	return false;			return false;

	switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {			switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
	default: return false;			default: return false;
	▲ Show 20 Lines • Show All 1,783 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/private-element-size.ll

	Show All 27 Lines
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}

	; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}			; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
	; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}			; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
	; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}			; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
	; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}			; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
	define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {			define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
	entry:			entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()			%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%idxprom = sext i32 %tid to i64			%idxprom = sext i32 %tid to i64
	%gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom			%gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
	%index.load = load i32, i32 addrspace(1)* %gep.index			%index.load = load i32, i32 addrspace(1)* %gep.index
	%index = and i32 %index.load, 2			%index = and i32 %index.load, 2
	%alloca = alloca [2 x <4 x i32>], align 16			%alloca = alloca [2 x <4 x i32>], align 16
	▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:36{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:36{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:44{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:44{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:52{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:52{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}
	; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}			; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}

	; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}			; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
	; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}			; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
	; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}			; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
	; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}			; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
				; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
				; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
				; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
				; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
	define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {			define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
	entry:			entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()			%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%idxprom = sext i32 %tid to i64			%idxprom = sext i32 %tid to i64
	%gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom			%gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
	%index.load = load i32, i32 addrspace(1)* %gep.index			%index.load = load i32, i32 addrspace(1)* %gep.index
	%index = and i32 %index.load, 2			%index = and i32 %index.load, 2
	%alloca = alloca [2 x <8 x i32>], align 16			%alloca = alloca [2 x <8 x i32>], align 16
	Show All 15 Lines

test/CodeGen/AMDGPU/scratch-buffer.ll

; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s \| FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s		; RUN: llc -verify-machineinstrs -march=amdgcn < %s \| FileCheck -check-prefix=GCN %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s \| FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s		; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s \| FileCheck -check-prefix=GCN %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s \| FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s \| FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s

; When a frame index offset is more than 12-bits, make sure we don't store		; When a frame index offset is more than 12-bits, make sure we don't store
; it in mubuf's offset field.		; it in mubuf's offset field.

; Also, make sure we use the same register for storing the scratch buffer addresss		; Also, make sure we use the same register for storing the scratch buffer addresss
; for both stores. This register is allocated by the register scavenger, so we		; for both stores. This register is allocated by the register scavenger, so we
; should be able to reuse the same regiser for each scratch buffer access.		; should be able to reuse the same regiser for each scratch buffer access.

▲ Show 20 Lines • Show All 84 Lines • ▼ Show 20 Lines	entry:
%array = alloca [8192 x i32]		%array = alloca [8192 x i32]
%ptr_offset = add i32 %offset, 4		%ptr_offset = add i32 %offset, 4
%ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %ptr_offset		%ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %ptr_offset
store i32 0, i32* %ptr		store i32 0, i32* %ptr
ret void		ret void
}		}

; GCN-LABEL: @pos_vaddr_offse		; GCN-LABEL: @pos_vaddr_offse
; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16		; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {		define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
entry:		entry:
%array = alloca [8192 x i32]		%array = alloca [8192 x i32]
%ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4		%ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4
store i32 0, i32* %ptr		store i32 0, i32* %ptr
%load_ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %offset		%load_ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %offset
%val = load i32, i32* %load_ptr		%val = load i32, i32* %load_ptr
store i32 %val, i32 addrspace(1)* %out		store i32 %val, i32 addrspace(1)* %out
ret void		ret void
}		}