Diff 53719

include/llvm/Support/MathExtras.h

	Show First 20 Lines • Show All 615 Lines • ▼ Show 20 Lines
	/// alignTo(~0LL, 8, 3) = 3			/// alignTo(~0LL, 8, 3) = 3
	/// alignTo(321, 255, 42) = 552			/// alignTo(321, 255, 42) = 552
	/// \endcode			/// \endcode
	inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {			inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
	Skew %= Align;			Skew %= Align;
	return (Value + Align - 1 - Skew) / Align * Align + Skew;			return (Value + Align - 1 - Skew) / Align * Align + Skew;
	}			}

				/// Rounds \p Value down to the nearest uint64_t that is congruent to
				/// \p Skew modulo \p Align.
				///
				/// \p Align must be non-zero (it is used as a divisor). \p Skew is first
				/// reduced modulo \p Align. The subtraction is unsigned, so a \p Value
				/// smaller than the reduced skew wraps around (mod 2**64).
				///
				/// Examples:
				/// \code
				///   alignDown(17, 8) = 16
				///   alignDown(17, 8, 3) = 11
				///   alignDown(321, 255, 42) = 297
				/// \endcode
				inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
				  const uint64_t Residue = Skew % Align;
				  // Strip the residue, truncate to a whole number of Align-sized steps,
				  // then put the residue back.
				  const uint64_t WholeSteps = (Value - Residue) / Align;
				  return WholeSteps * Align + Residue;
				}

	/// Returns the offset to the next integer (mod 2**64) that is greater than			/// Returns the offset to the next integer (mod 2**64) that is greater than
	/// or equal to \p Value and is a multiple of \p Align. \p Align must be			/// or equal to \p Value and is a multiple of \p Align. \p Align must be
	/// non-zero.			/// non-zero.
	inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) {			inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) {
	return alignTo(Value, Align) - Value;			return alignTo(Value, Align) - Value;
	}			}

	/// SignExtend32 - Sign extend B-bit number x to 32-bit int.			/// SignExtend32 - Sign extend B-bit number x to 32-bit int.
	▲ Show 20 Lines • Show All 110 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Show First 20 Lines • Show All 475 Lines • ▼ Show 20 Lines	for (User *User : Val->users()) {
if (!collectUsesWithPtrTypes(User, WorkList))		if (!collectUsesWithPtrTypes(User, WorkList))
return false;		return false;
}		}

return true;		return true;
}		}

void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {		void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (!I.isStaticAlloca())		if (!I.isStaticAlloca())
		arsenmUnsubmitted Not Done Reply Inline Actions & should be on RHS and this should be moved down to uses arsenm: & should be on RHS and this should be moved down to uses
return;		return;

IRBuilder<> Builder(&I);		IRBuilder<> Builder(&I);

// First try to replace the alloca with a vector		// First try to replace the alloca with a vector
Type *AllocaTy = I.getAllocatedType();		Type *AllocaTy = I.getAllocatedType();

DEBUG(dbgs() << "Trying to promote " << I << '\n');		DEBUG(dbgs() << "Trying to promote " << I << '\n');

if (tryPromoteAllocaToVector(&I))		if (tryPromoteAllocaToVector(&I))
return;		return;

DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");		DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

// FIXME: This is the maximum work group size. We should try to get		const Function &ContainingFunction = *I.getParent()->getParent();
// value from the reqd_work_group_size function attribute if it is
// available.		// FIXME: We should also try to get this value from the reqd_work_group_size
unsigned WorkGroupSize = 256;		// function attribute if it is available.
		unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);

int AllocaSize =		int AllocaSize =
WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);		WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
		nhaehnleUnsubmitted Not Done Reply Inline Actions Fix the comment :) nhaehnle: Fix the comment :)
		bnieuwenhuizenAuthorUnsubmitted Not Done Reply Inline Actions We don't technically handle that function attribute yet. I can move it somewhere more appropriate, like getMaxWorkGroupSize or also make getMaxWorkGroupSize detect and use reqd_work_group_size. I noticed though that we have all the information for reqd_work_group_size in mesa (unless we want to support ARB_compute_variable_group_size). Maybe it is worth it to switch completely to reqd_work_group_size. bnieuwenhuizen: We don't technically handle that function attribute yet. I can move it somewhere more…
		nhaehnleUnsubmitted Not Done Reply Inline Actions I see what you mean. The semantics are also slightly different. It does make sense to keep a comment about this. nhaehnle: I see what you mean. The semantics are also slightly different. It does make sense to keep a…

if (AllocaSize > LocalMemAvailable) {		if (AllocaSize > LocalMemAvailable) {
DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");		DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
return;		return;
}		}

std::vector<Value*> WorkList;		std::vector<Value*> WorkList;

if (!collectUsesWithPtrTypes(&I, WorkList)) {		if (!collectUsesWithPtrTypes(&I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");		DEBUG(dbgs() << " Do not know how to convert all uses\n");
return;		return;
}		}

DEBUG(dbgs() << "Promoting alloca to local memory\n");		DEBUG(dbgs() << "Promoting alloca to local memory\n");
LocalMemAvailable -= AllocaSize;		LocalMemAvailable -= AllocaSize;

Function *F = I.getParent()->getParent();		Function *F = I.getParent()->getParent();

Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);		Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
GlobalVariable *GV = new GlobalVariable(		GlobalVariable *GV = new GlobalVariable(
*Mod, GVTy, false, GlobalValue::InternalLinkage,		*Mod, GVTy, false, GlobalValue::InternalLinkage,
UndefValue::get(GVTy),		UndefValue::get(GVTy),
Twine(F->getName()) + Twine('.') + I.getName(),		Twine(F->getName()) + Twine('.') + I.getName(),
nullptr,		nullptr,
GlobalVariable::NotThreadLocal,		GlobalVariable::NotThreadLocal,
AMDGPUAS::LOCAL_ADDRESS);		AMDGPUAS::LOCAL_ADDRESS);
GV->setUnnamedAddr(true);		GV->setUnnamedAddr(true);
▲ Show 20 Lines • Show All 126 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIMachineFunctionInfo.h

Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned WorkGroupIDZSystemSGPR;		unsigned WorkGroupIDZSystemSGPR;
unsigned WorkGroupInfoSystemSGPR;		unsigned WorkGroupInfoSystemSGPR;
unsigned PrivateSegmentWaveByteOffsetSystemSGPR;		unsigned PrivateSegmentWaveByteOffsetSystemSGPR;

// Graphics info.		// Graphics info.
unsigned PSInputAddr;		unsigned PSInputAddr;
bool ReturnsVoid;		bool ReturnsVoid;

		unsigned MaximumWorkGroupSize;

public:		public:
// FIXME: Make private		// FIXME: Make private
unsigned LDSWaveSpillSize;		unsigned LDSWaveSpillSize;
unsigned PSInputEna;		unsigned PSInputEna;
std::map<unsigned, unsigned> LaneVGPRs;		std::map<unsigned, unsigned> LaneVGPRs;
unsigned ScratchOffsetReg;		unsigned ScratchOffsetReg;
unsigned NumUserSGPRs;		unsigned NumUserSGPRs;
unsigned NumSystemSGPRs;		unsigned NumSystemSGPRs;
▲ Show 20 Lines • Show All 254 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	: AMDGPUMachineFunction(MF),
GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),		GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),		WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),		WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),		WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),		WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),		PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
PSInputAddr(0),		PSInputAddr(0),
ReturnsVoid(true),		ReturnsVoid(true),
		MaximumWorkGroupSize(0),
		arsenmUnsubmitted Not Done Reply Inline Actions Should be set to 0 here arsenm: Should be set to 0 here
LDSWaveSpillSize(0),		LDSWaveSpillSize(0),
PSInputEna(0),		PSInputEna(0),
NumUserSGPRs(0),		NumUserSGPRs(0),
NumSystemSGPRs(0),		NumSystemSGPRs(0),
HasSpilledSGPRs(false),		HasSpilledSGPRs(false),
HasSpilledVGPRs(false),		HasSpilledVGPRs(false),
HasNonSpillStackObjects(false),		HasNonSpillStackObjects(false),
HasFlatInstructions(false),		HasFlatInstructions(false),
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}		}

// We don't need to worry about accessing spills with flat instructions.		// We don't need to worry about accessing spills with flat instructions.
// TODO: On VI where we must use flat for global, we should be able to omit		// TODO: On VI where we must use flat for global, we should be able to omit
// this if it is never used for generic access.		// this if it is never used for generic access.
if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&		if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
ST.isAmdHsaOS())		ST.isAmdHsaOS())
FlatScratchInit = true;		FlatScratchInit = true;

		if (AMDGPU::isCompute(F->getCallingConv()))
		arsenmUnsubmitted Not Done Reply Inline Actions Why is this function needed? You can just compare to F->getCallingConv directly arsenm: Why is this function needed? You can just compare to F->getCallingConv directly
		bnieuwenhuizenAuthorUnsubmitted Not Done Reply Inline Actions I want to have everything in the compute stage here. Note that we have multiple calling conventions for that: at least one for compute kernels and one for mesa shaders. This is a helper to check if it is one of the compute stage calling conventions, so we only have one place we need to remember to put all the conventions. bnieuwenhuizen: I want to have everything in the compute stage here. Note that we have multiple calling…
		MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
		else
		MaximumWorkGroupSize = ST.getWavefrontSize();
}		}

unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(		unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {		const SIRegisterInfo &TRI) {
PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(		PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);		getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
NumUserSGPRs += 4;		NumUserSGPRs += 4;
return PrivateSegmentBufferUserSGPR;		return PrivateSegmentBufferUserSGPR;
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines	SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
}		}

Spill.VGPR = LaneVGPRs[LaneVGPRIdx];		Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
return Spill;		return Spill;
}		}

unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(		unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
const MachineFunction &MF) const {		const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();		return MaximumWorkGroupSize;
// FIXME: We should get this information from kernel attributes if it
// is available.
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv()))
return 256;
return ST.getWavefrontSize();
}		}

lib/Target/AMDGPU/SIRegisterInfo.cpp

Show All 17 Lines
#include "llvm/CodeGen/MachineFrameInfo.h"		#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"		#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"		#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"

using namespace llvm;		using namespace llvm;

		static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
		const SIMachineFunctionInfo& MFI = *MF.getInfo<SIMachineFunctionInfo>();
		const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
		unsigned SIMDPerCU = 4;

		unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
		nhaehnleUnsubmitted Not Done Reply Inline Actions I find the min/max naming here confusing. Surely this function should be named getMaxWaveCount? Also, what about getMaxWaveCountPerSIMD? nhaehnle: I find the min/max naming here confusing. Surely this function should be named getMaxWaveCount?
		return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
		MaxInvocationsPerWave;
		}

		static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
		const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
		unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);

		arsenmUnsubmitted Not Done Reply Inline Actions This should not be a magic number. Also MaxWorkgroupSGPRCount would be a better name arsenm: This should not be a magic number. Also MaxWorkgroupSGPRCount would be a better name
		unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
		arsenmUnsubmitted Not Done Reply Inline Actions There is an alignTo function you should use arsenm: There is an alignTo function you should use
		bnieuwenhuizenAuthorUnsubmitted Not Done Reply Inline Actions Note that alignTo rounds up, this rounds down. bnieuwenhuizen: Note that alignTo rounds up, this rounds down.
		unsigned ReservedSGPRCount;

		if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
		nhaehnleUnsubmitted Not Done Reply Inline Actions VI+ rounds SGPR allocations to multiples of 16. nhaehnle: VI+ rounds SGPR allocations to multiples of 16.
		TotalSGPRCountPerSIMD = 800;
		AddressableSGPRCount = 102;
		SGPRUsageAlignment = 16;
		ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
		} else {
		TotalSGPRCountPerSIMD = 512;
		AddressableSGPRCount = 104;
		SGPRUsageAlignment = 8;
		ReservedSGPRCount = 2; // VCC
		}
		nhaehnleUnsubmitted Not Done Reply Inline Actions As Marek has pointed out in the past, the calculations surrounding SGPR allocations are a bit questionable. The 102/104 limit comes from the instruction encoding, which can encode 104 SGPRs in <= CIK and 102 SGPRs in >= VI. The subtraction of 2/6 comes from VCC/XNACK/FLAT_SCR - but we don't have to address those as regular SGPRs! So on VI+, it's perfectly fine to use 102 SGPRs, you just then have to tell the hardware to reserve >= 108 slots in the SGPR register bank. Due to rounding up to multiples of 16, this ends up allocating 112 SGPRs, which does waste four slots per wave, but whatever. In practice, what this means for your calculation is that it should be std::min(AllowedSGPRCount - 2/6, 104/102) (except for the InitBug thing, where you should indeed clamp AllowedSGPRCount before taking space for VCC & friends. nhaehnle: As Marek has pointed out in the past, the calculations surrounding SGPR allocations are a bit…

		unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
		MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);

		if (ST.hasSGPRInitBug())
		MaxSGPRCount = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

		return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
		}
		nhaehnleUnsubmitted Not Done Reply Inline Actions I was confused by that closing brace for a moment. I'm not sure what the LLVM coding style says if anything, but personally I find something like `// anonymous namespace` helpful. nhaehnle: I was confused by that closing brace for a moment. I'm not sure what the LLVM coding style says…
		bnieuwenhuizenAuthorUnsubmitted Not Done Reply Inline Actions Seems like the LLVM coding style wants static instead of anonymous namespaces for functions (http://llvm.org/docs/CodingStandards.html#anonymous-namespaces). I will change that. bnieuwenhuizen: Seems like the LLVM coding style wants static instead of anonymous namespaces for functions…
		nhaehnleUnsubmitted Not Done Reply Inline Actions Thanks. With that changed, and arguably with a small alignDown helper, the patch LGTM. nhaehnle: Thanks. With that changed, and arguably with a small alignDown helper, the patch LGTM.

		static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
		unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
		unsigned TotalVGPRCountPerSIMD = 256;
		unsigned VGPRUsageAlignment = 4;

		return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
		VGPRUsageAlignment);
		}

static bool hasPressureSet(const int *PSets, unsigned PSetID) {		static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {		for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)		if (PSets[i] == (int)PSetID)
return true;		return true;
}		}
return false;		return false;
}		}

Show All 32 Lines	void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
MCRegAliasIterator R(Reg, this, true);		MCRegAliasIterator R(Reg, this, true);

for (; R.isValid(); ++R)		for (; R.isValid(); ++R)
Reserved.set(*R);		Reserved.set(*R);
}		}

unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(		unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {		const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();		unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
if (ST.hasSGPRInitBug()) {
// Leave space for flat_scr, xnack_mask, vcc, and alignment
unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));		unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);		return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}		}

if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
// 100/101 for vcc. This is the next sgpr128 down.
return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
}

return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
}

unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(		unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {		const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();		unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
if (ST.hasSGPRInitBug()) {		unsigned Reg;
unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
return AMDGPU::SGPR_32RegClass.getRegister(Idx);
}

if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {		// Try to place it in a hole after PrivateSegmentbufferReg.
// Next register before reservations for flat_scr, xnack_mask, vcc,		if (RegCount & 3) {
// and scratch resource.		// We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
return AMDGPU::SGPR91;		// alignment constraints, so we have a hole where can put the wave offset.
		Reg = RegCount - 1;
		} else {
		// We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
		// wave offset before it.
		Reg = RegCount - 5;
}		}
		return AMDGPU::SGPR_32RegClass.getRegister(Reg);
return AMDGPU::SGPR95;
}		}

		arsenmUnsubmitted Not Done Reply Inline Actions Space after if arsenm: Space after if
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {		BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());		BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);		Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
		arsenmUnsubmitted Not Done Reply Inline Actions Magic numbers arsenm: Magic numbers

// EXEC_LO and EXEC_HI could be allocated and used as regular register, but		// EXEC_LO and EXEC_HI could be allocated and used as regular register, but
// this seems likely to result in bugs, so I'm marking them as reserved.		// this seems likely to result in bugs, so I'm marking them as reserved.
reserveRegisterTuples(Reserved, AMDGPU::EXEC);		reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);		reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

// Reserve Trap Handler registers - support is not implemented in Codegen.		// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);		reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);		reserveRegisterTuples(Reserved, AMDGPU::TMA);
reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);		reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);		reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);		reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);		reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);		reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);		reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);

// Reserve the last 2 registers so we will always have at least 2 more that		unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
// will physically contain VCC.		unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);

const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();

if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
// for VCC/XNACK_MASK/FLAT_SCR.
//
// TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
// SGPRs when the XNACK feature is not used. This is currently not done
// because the code that counts SGPRs cannot account for such holes.
reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
}

// Tonga and Iceland can only allocate a fixed number of SGPRs due
// to a hw bug.
if (ST.hasSGPRInitBug()) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();		unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
// Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).		unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;		for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {

for (unsigned i = Limit; i < NumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);		unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);		reserveRegisterTuples(Reserved, Reg);
}		}


		for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
		unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
		reserveRegisterTuples(Reserved, Reg);
}		}

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();		const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();		unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {		if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
// Reserve 1 SGPR for scratch wave offset in case we need to spill.		// Reserve 1 SGPR for scratch wave offset in case we need to spill.
reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);		reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
▲ Show 20 Lines • Show All 669 Lines • Show Last 20 Lines

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

	Show All 39 Lines
	MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);			MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);

	MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);			MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);

	bool isGroupSegment(const GlobalValue *GV);			bool isGroupSegment(const GlobalValue *GV);
	bool isGlobalSegment(const GlobalValue *GV);			bool isGlobalSegment(const GlobalValue *GV);
	bool isReadOnlySegment(const GlobalValue *GV);			bool isReadOnlySegment(const GlobalValue *GV);

				unsigned getMaximumWorkGroupSize(const Function &F);
	unsigned getInitialPSInputAddr(const Function &F);			unsigned getInitialPSInputAddr(const Function &F);

	bool isShader(CallingConv::ID cc);			bool isShader(CallingConv::ID cc);
	bool isCompute(CallingConv::ID cc);			bool isCompute(CallingConv::ID cc);

	bool isSI(const MCSubtargetInfo &STI);			bool isSI(const MCSubtargetInfo &STI);
	bool isCI(const MCSubtargetInfo &STI);			bool isCI(const MCSubtargetInfo &STI);
	bool isVI(const MCSubtargetInfo &STI);			bool isVI(const MCSubtargetInfo &STI);
	Show All 9 Lines

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Show First 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	if (A.isStringAttribute()) {
if (Str.getAsInteger(0, Result)) {		if (Str.getAsInteger(0, Result)) {
LLVMContext &Ctx = F.getContext();		LLVMContext &Ctx = F.getContext();
Ctx.emitError("can't parse shader type");		Ctx.emitError("can't parse shader type");
}		}
}		}
return Result;		return Result;
}		}

		unsigned getMaximumWorkGroupSize(const Function &F) {
		return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
		nhaehnleUnsubmitted Not Done Reply Inline Actions According to Matt, there should be an amdgpu- prefix here. nhaehnle: According to Matt, there should be an amdgpu- prefix here.
		arsenmUnsubmitted Not Done Reply Inline Actions I would also probably reduce this to -max- rather than -maximum- since most other maximums do this arsenm: I would also probably reduce this to -max- rather than -maximum- since most other maximums do…
		}

unsigned getInitialPSInputAddr(const Function &F) {		unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);		return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}		}

bool isShader(CallingConv::ID cc) {		bool isShader(CallingConv::ID cc) {
switch(cc) {		switch(cc) {
case CallingConv::AMDGPU_VS:		case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:		case CallingConv::AMDGPU_GS:
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

This file was added.

				; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s

				; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4

				define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
				entry:
				%stack = alloca [5 x i32], align 4
				%0 = load i32, i32 addrspace(1)* %in, align 4
				%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
				store i32 4, i32* %arrayidx1, align 4
				%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
				%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
				%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
				store i32 5, i32* %arrayidx3, align 4
				%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
				%2 = load i32, i32* %arrayidx10, align 4
				store i32 %2, i32 addrspace(1)* %out, align 4
				%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
				%3 = load i32, i32* %arrayidx12
				%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
				store i32 %3, i32 addrspace(1)* %arrayidx13
				ret void
				}

				; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4

				arsenmUnsubmitted Not Done Reply Inline Actions Attributes should be moved to the bottom. Also run the test through opt with no arguments to simplify the sets arsenm: Attributes should be moved to the bottom. Also run the test through opt with no arguments to…
				define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
				entry:
				%stack = alloca [5 x i32], align 4
				%0 = load i32, i32 addrspace(1)* %in, align 4
				%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
				store i32 4, i32* %arrayidx1, align 4
				%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
				%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
				%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
				store i32 5, i32* %arrayidx3, align 4
				%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
				%2 = load i32, i32* %arrayidx10, align 4
				store i32 %2, i32 addrspace(1)* %out, align 4
				%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
				%3 = load i32, i32* %arrayidx12
				%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
				store i32 %3, i32 addrspace(1)* %arrayidx13
				ret void
				}

				; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4

				define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
				entry:
				%stack = alloca [5 x i32], align 4
				%0 = load i32, i32 addrspace(1)* %in, align 4
				%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
				store i32 4, i32* %arrayidx1, align 4
				%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
				%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
				%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
				store i32 5, i32* %arrayidx3, align 4
				%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
				%2 = load i32, i32* %arrayidx10, align 4
				store i32 %2, i32 addrspace(1)* %out, align 4
				%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
				%3 = load i32, i32* %arrayidx12
				%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
				store i32 %3, i32 addrspace(1)* %arrayidx13
				ret void
				}

				attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
				attributes #1 = { nounwind "amdgpu-max-work-group-size"="256" }
				attributes #2 = { nounwind "amdgpu-max-work-group-size"="1600" }

test/CodeGen/AMDGPU/large-work-group-registers.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s

				; CHECK: NumVgprs: 63
				define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
				main_body:
				%8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
				%9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
				%10 = extractelement <3 x i32> %7, i32 0
				%11 = extractelement <3 x i32> %7, i32 1
				%12 = mul i32 %10, %11
				%bc = bitcast <3 x i32> %7 to <3 x float>
				%13 = extractelement <3 x float> %bc, i32 1
				%14 = insertelement <512 x float> undef, float %13, i32 %12
				call void @llvm.amdgcn.s.barrier()
				%15 = extractelement <3 x i32> %6, i32 0
				%16 = extractelement <3 x i32> %7, i32 0
				%17 = shl i32 %15, 5
				%18 = add i32 %17, %16
				%19 = shl i32 %18, 4
				%20 = extractelement <3 x i32> %7, i32 1
				%21 = shl i32 %20, 2
				%22 = sext i32 %21 to i64
				%23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
				%24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
				%25 = load i32, i32 addrspace(3)* %24, align 4
				%26 = extractelement <512 x float> %14, i32 %25
				%27 = insertelement <4 x float> undef, float %26, i32 0
				call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
				ret void
				}

				declare void @llvm.amdgcn.s.barrier() #1

				declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2

				attributes #0 = { "amdgpu-max-work-group-size"="1024" }
				attributes #1 = { convergent nounwind }
				attributes #2 = { nounwind }

				!0 = !{!1, !1, i64 0, i32 1}
				!1 = !{!"const", null}

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 53719

include/llvm/Support/MathExtras.h

lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

lib/Target/AMDGPU/SIMachineFunctionInfo.h

lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

lib/Target/AMDGPU/SIRegisterInfo.cpp

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

test/CodeGen/AMDGPU/large-work-group-registers.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: allow specifying a workgroup size that needs to fit in a compute unitClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 53719

include/llvm/Support/MathExtras.h

lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

lib/Target/AMDGPU/SIMachineFunctionInfo.h

lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

lib/Target/AMDGPU/SIRegisterInfo.cpp

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

test/CodeGen/AMDGPU/large-work-group-registers.ll

AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
ClosedPublic