This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Limit promote alloca to vector with VGPR budget
ClosedPublic

Authored by rampitec on Jul 1 2020, 12:11 PM.

Download Raw Diff

Details

Reviewers

arsenm
kerbowa

Commits

rG54e2dc7537dd: [AMDGPU] Limit promote alloca to vector with VGPR budget

Summary

Allow only up to 1/4 of available VGPRs for the vectorization
of any given alloca.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

rampitec created this revision. Jul 1 2020, 12:11 PM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 1 2020, 12:11 PM

Herald added subscribers: hiraditya, t-tye, tpr and 6 others. · View Herald Transcript

arsenm added inline comments. Jul 1 2020, 12:30 PM

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
444	This seems like a huge default. I thought we previously limited this to 16 VGPRs. There should probably also be a cl::opt for this.

rampitec marked an inline comment as done. Jul 1 2020, 1:02 PM

rampitec added inline comments.

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
444	We did not limit it to 16 VGPRs but to 16 elements, and that limit is still in place. This new limit solves a different problem: it keeps us from running out of registers when only a limited number are available. For example, a workgroup size of 1024 leaves us with 64 VGPRs, and then the limit would be 16.

Added cl::opt.

arsenm accepted this revision. Jul 1 2020, 3:26 PM

This revision is now accepted and ready to land. Jul 1 2020, 3:26 PM

Closed by commit rG54e2dc7537dd: [AMDGPU] Limit promote alloca to vector with VGPR budget (authored by rampitec). · Explain Why · Jul 1 2020, 4:15 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUPromoteAlloca.cpp

50 lines

test/

CodeGen/

AMDGPU/

vector-alloca-limits.ll

136 lines

Diff 274953

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Show First 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	static cl::opt<bool> DisablePromoteAllocaToVector(
cl::desc("Disable promote alloca to vector"),		cl::desc("Disable promote alloca to vector"),
cl::init(false));		cl::init(false));

static cl::opt<bool> DisablePromoteAllocaToLDS(		static cl::opt<bool> DisablePromoteAllocaToLDS(
"disable-promote-alloca-to-lds",		"disable-promote-alloca-to-lds",
cl::desc("Disable promote alloca to LDS"),		cl::desc("Disable promote alloca to LDS"),
cl::init(false));		cl::init(false));

		static cl::opt<unsigned> PromoteAllocaToVectorLimit(
		"amdgpu-promote-alloca-to-vector-limit",
		cl::desc("Maximum byte size to consider promote alloca to vector"),
		cl::init(0));

// FIXME: This can create globals so should be a module pass.		// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {		class AMDGPUPromoteAlloca : public FunctionPass {
private:		private:
const TargetMachine *TM;		const TargetMachine *TM;
Module *Mod = nullptr;		Module *Mod = nullptr;
const DataLayout *DL = nullptr;		const DataLayout *DL = nullptr;

// FIXME: This should be per-kernel.		// FIXME: This should be per-kernel.
uint32_t LocalMemLimit = 0;		uint32_t LocalMemLimit = 0;
uint32_t CurrentLocalMemUsage = 0;		uint32_t CurrentLocalMemUsage = 0;
		unsigned MaxVGPRs;

bool IsAMDGCN = false;		bool IsAMDGCN = false;
bool IsAMDHSA = false;		bool IsAMDHSA = false;

std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);		std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);		Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

/// BaseAlloca is the alloca root the search started from.		/// BaseAlloca is the alloca root the search started from.
Show All 27 Lines	public:

void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();		AU.setPreservesCFG();
FunctionPass::getAnalysisUsage(AU);		FunctionPass::getAnalysisUsage(AU);
}		}
};		};

class AMDGPUPromoteAllocaToVector : public FunctionPass {		class AMDGPUPromoteAllocaToVector : public FunctionPass {
		private:
		unsigned MaxVGPRs;

public:		public:
static char ID;		static char ID;

AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}		AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}

bool runOnFunction(Function &F) override;		bool runOnFunction(Function &F) override;

StringRef getPassName() const override {		StringRef getPassName() const override {
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
const Triple &TT = TM->getTargetTriple();		const Triple &TT = TM->getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;		IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;		IsAMDHSA = TT.getOS() == Triple::AMDHSA;

const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);		const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
if (!ST.isPromoteAllocaEnabled())		if (!ST.isPromoteAllocaEnabled())
return false;		return false;

		if (IsAMDGCN) {
		const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
		MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
		} else {
		MaxVGPRs = 128;
		}

bool SufficientLDS = hasSufficientLocalMem(F);		bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;		bool Changed = false;
BasicBlock &EntryBB = *F.begin();		BasicBlock &EntryBB = *F.begin();

SmallVector<AllocaInst *, 16> Allocas;		SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : EntryBB) {		for (Instruction &I : EntryBB) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))		if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
Allocas.push_back(AI);		Allocas.push_back(AI);
▲ Show 20 Lines • Show All 207 Lines • ▼ Show 20 Lines	return (SI->getPointerOperand() == User) &&
UserInst->getOpcode() == Instruction::BitCast) &&		UserInst->getOpcode() == Instruction::BitCast) &&
SI->isSimple();		SI->isSimple();
}		}
default:		default:
return false;		return false;
}		}
}		}

static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {		static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
		unsigned MaxVGPRs) {

if (DisablePromoteAllocaToVector) {		if (DisablePromoteAllocaToVector) {
LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");		LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
return false;		return false;
}		}

Type *AllocaTy = Alloca->getAllocatedType();		Type *AllocaTy = Alloca->getAllocatedType();
auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);		auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {		if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
if (VectorType::isValidElementType(ArrayTy->getElementType()) &&		if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
ArrayTy->getNumElements() > 0)		ArrayTy->getNumElements() > 0)
VectorTy = arrayTypeToVecType(ArrayTy);		VectorTy = arrayTypeToVecType(ArrayTy);
}		}

		// Use up to 1/4 of available register budget for vectorization.
		[Inline comment, arsenm (marked Done)] This seems like a huge default. I thought we previously limited this to 16 VGPRs. There should also probably be a cl::opt for this too
		[Inline comment, rampitec (author, marked Done)] We did not limit it to 16 VGPRs but to 16 elements. This limit is still here. The problem this limit solves is different, it is when you only have limited number of registers not to run out of them. Like wg size 1024 leaves us with 64 VGPRs and then the limit would be 16.
		unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
		: (MaxVGPRs * 32);

		if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
		LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
		<< MaxVGPRs << " registers available\n");
		return false;
		}

LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");		LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");

// FIXME: There is no reason why we can't support larger arrays, we		// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.		// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these		// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
// could also be promoted but we don't currently handle this case		// could also be promoted but we don't currently handle this case
if (!VectorTy || VectorTy->getNumElements() > 16 ||		if (!VectorTy || VectorTy->getNumElements() > 16 ||
VectorTy->getNumElements() < 2) {		VectorTy->getNumElements() < 2) {
▲ Show 20 Lines • Show All 366 Lines • ▼ Show 20 Lines	bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
const DataLayout &DL = Mod->getDataLayout();		const DataLayout &DL = Mod->getDataLayout();
IRBuilder<> Builder(&I);		IRBuilder<> Builder(&I);

// First try to replace the alloca with a vector		// First try to replace the alloca with a vector
Type *AllocaTy = I.getAllocatedType();		Type *AllocaTy = I.getAllocatedType();

LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');		LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

if (tryPromoteAllocaToVector(&I, DL))		if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
return true; // Promoted to vector.		return true; // Promoted to vector.

if (DisablePromoteAllocaToLDS)		if (DisablePromoteAllocaToLDS)
return false;		return false;

const Function &ContainingFunction = *I.getParent()->getParent();		const Function &ContainingFunction = *I.getParent()->getParent();
CallingConv::ID CC = ContainingFunction.getCallingConv();		CallingConv::ID CC = ContainingFunction.getCallingConv();

▲ Show 20 Lines • Show All 193 Lines • ▼ Show 20 Lines	bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
}		}
return true;		return true;
}		}

bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {		bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
if (skipFunction(F) || DisablePromoteAllocaToVector)		if (skipFunction(F) || DisablePromoteAllocaToVector)
return false;		return false;

		const TargetMachine *TM;
		if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
		TM = &TPC->getTM<TargetMachine>();
		else
		return false;

		const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
		if (!ST.isPromoteAllocaEnabled())
		return false;

		if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
		const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
		MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
		} else {
		MaxVGPRs = 128;
		}

bool Changed = false;		bool Changed = false;
BasicBlock &EntryBB = *F.begin();		BasicBlock &EntryBB = *F.begin();

SmallVector<AllocaInst *, 16> Allocas;		SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : EntryBB) {		for (Instruction &I : EntryBB) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))		if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
Allocas.push_back(AI);		Allocas.push_back(AI);
}		}
Show All 10 Lines	bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
// Array allocations are probably not worth handling, since an allocation of		// Array allocations are probably not worth handling, since an allocation of
// the array type is the canonical form.		// the array type is the canonical form.
if (!I.isStaticAlloca() || I.isArrayAllocation())		if (!I.isStaticAlloca() || I.isArrayAllocation())
return false;		return false;

LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');		LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

Module *Mod = I.getParent()->getParent()->getParent();		Module *Mod = I.getParent()->getParent()->getParent();
return tryPromoteAllocaToVector(&I, Mod->getDataLayout());		return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
}		}

FunctionPass *llvm::createAMDGPUPromoteAlloca() {		FunctionPass *llvm::createAMDGPUPromoteAlloca() {
return new AMDGPUPromoteAlloca();		return new AMDGPUPromoteAlloca();
}		}

FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {		FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
return new AMDGPUPromoteAllocaToVector();		return new AMDGPUPromoteAllocaToVector();
}		}

llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll

This file was added.

				; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
				; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s

				target datalayout = "A5"

				; OPT-LABEL: @alloca_8xi64_max1024(
				; OPT-NOT: alloca
				; OPT: <8 x i64>
				; LIMIT32: alloca
				; LIMIT32-NOT: <8 x i64>
				define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
				entry:
				%tmp = alloca [8 x i64], addrspace(5)
				%x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
				store i64 0, i64 addrspace(5)* %x
				%tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i64, i64 addrspace(5)* %tmp1
				store i64 %tmp2, i64 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @alloca_9xi64_max1024(
				; OPT: alloca [9 x i64]
				; OPT-NOT: <9 x i64>
				; LIMIT32: alloca
				; LIMIT32-NOT: <9 x i64>
				define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
				entry:
				%tmp = alloca [9 x i64], addrspace(5)
				%x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
				store i64 0, i64 addrspace(5)* %x
				%tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i64, i64 addrspace(5)* %tmp1
				store i64 %tmp2, i64 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @alloca_16xi64_max512(
				; OPT-NOT: alloca
				; OPT: <16 x i64>
				; LIMIT32: alloca
				; LIMIT32-NOT: <16 x i64>
				define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
				entry:
				%tmp = alloca [16 x i64], addrspace(5)
				%x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
				store i64 0, i64 addrspace(5)* %x
				%tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i64, i64 addrspace(5)* %tmp1
				store i64 %tmp2, i64 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @alloca_17xi64_max512(
				; OPT: alloca [17 x i64]
				; OPT-NOT: <17 x i64>
				; LIMIT32: alloca
				; LIMIT32-NOT: <17 x i64>
				define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
				entry:
				%tmp = alloca [17 x i64], addrspace(5)
				%x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
				store i64 0, i64 addrspace(5)* %x
				%tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i64, i64 addrspace(5)* %tmp1
				store i64 %tmp2, i64 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @alloca_9xi128_max512(
				; OPT: alloca [9 x i128]
				; OPT-NOT: <9 x i128>
				; LIMIT32: alloca
				; LIMIT32-NOT: <9 x i128>
				define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
				entry:
				%tmp = alloca [9 x i128], addrspace(5)
				%x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
				store i128 0, i128 addrspace(5)* %x
				%tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i128, i128 addrspace(5)* %tmp1
				store i128 %tmp2, i128 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @alloca_9xi128_max256(
				; OPT-NOT: alloca
				; OPT: <9 x i128>
				; LIMIT32: alloca
				; LIMIT32-NOT: <9 x i128>
				define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
				entry:
				%tmp = alloca [9 x i128], addrspace(5)
				%x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
				store i128 0, i128 addrspace(5)* %x
				%tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i128, i128 addrspace(5)* %tmp1
				store i128 %tmp2, i128 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @alloca_16xi128_max256(
				; OPT-NOT: alloca
				; OPT: <16 x i128>
				; LIMIT32: alloca
				; LIMIT32-NOT: <16 x i128>
				define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
				entry:
				%tmp = alloca [16 x i128], addrspace(5)
				%x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
				store i128 0, i128 addrspace(5)* %x
				%tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i128, i128 addrspace(5)* %tmp1
				store i128 %tmp2, i128 addrspace(1)* %out
				ret void
				}

				; OPT-LABEL: @alloca_9xi256_max256(
				; OPT: alloca [9 x i256]
				; OPT-NOT: <9 x i256>
				; LIMIT32: alloca
				; LIMIT32-NOT: <9 x i256>
				define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
				entry:
				%tmp = alloca [9 x i256], addrspace(5)
				%x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
				store i256 0, i256 addrspace(5)* %x
				%tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
				%tmp2 = load i256, i256 addrspace(5)* %tmp1
				store i256 %tmp2, i256 addrspace(1)* %out
				ret void
				}

				attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
				attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
				attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }