Diff 67801

lib/Target/AMDGPU/AMDGPUAsmPrinter.h

Show All 34 Lines	SIProgramInfo() :
ScratchSize(0),		ScratchSize(0),
ComputePGMRSrc1(0),		ComputePGMRSrc1(0),
LDSBlocks(0),		LDSBlocks(0),
ScratchBlocks(0),		ScratchBlocks(0),
ComputePGMRSrc2(0),		ComputePGMRSrc2(0),
NumVGPR(0),		NumVGPR(0),
NumSGPR(0),		NumSGPR(0),
FlatUsed(false),		FlatUsed(false),
		NumSGPRsForWavesPerEU(0),
		NumVGPRsForWavesPerEU(0),
ReservedVGPRFirst(0),		ReservedVGPRFirst(0),
ReservedVGPRCount(0),		ReservedVGPRCount(0),
DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),		DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),		DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
VCCUsed(false),		VCCUsed(false),
CodeLen(0) {}		CodeLen(0) {}

// Fields set in PGM_RSRC1 pm4 packet.		// Fields set in PGM_RSRC1 pm4 packet.
Show All 15 Lines	struct SIProgramInfo {

uint64_t ComputePGMRSrc2;		uint64_t ComputePGMRSrc2;

uint32_t NumVGPR;		uint32_t NumVGPR;
uint32_t NumSGPR;		uint32_t NumSGPR;
uint32_t LDSSize;		uint32_t LDSSize;
bool FlatUsed;		bool FlatUsed;

		// Number of SGPRs that meets number of waves per execution unit request.
		uint32_t NumSGPRsForWavesPerEU;

		// Number of VGPRs that meets number of waves per execution unit request.
		[Inline review comment by arsenm — Unsubmitted, Done] Add a blank line before this comment.
		uint32_t NumVGPRsForWavesPerEU;

// If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first		// If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
// fixed VGPR number reserved.		// fixed VGPR number reserved.
uint16_t ReservedVGPRFirst;		uint16_t ReservedVGPRFirst;

// The number of consecutive VGPRs reserved.		// The number of consecutive VGPRs reserved.
uint16_t ReservedVGPRCount;		uint16_t ReservedVGPRCount;

// Fixed SGPR number used to hold wave scratch offset for entire kernel		// Fixed SGPR number used to hold wave scratch offset for entire kernel
// execution, or uint16_t(-1) if the register is not used or not known.		// execution, or uint16_t(-1) if the register is not used or not known.
uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;		uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;

// Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire		// Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
// kernel execution, or uint16_t(-1) if the register is not used or not		// kernel execution, or uint16_t(-1) if the register is not used or not
// known.		// known.
uint16_t DebuggerPrivateSegmentBufferSGPR;		uint16_t DebuggerPrivateSegmentBufferSGPR;

// Bonus information for debugging.		// Bonus information for debugging.
bool VCCUsed;		bool VCCUsed;
uint64_t CodeLen;		uint64_t CodeLen;
▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Show First 20 Lines • Show All 196 Lines • ▼ Show 20 Lines	if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
false);		false);
OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),		OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
false);		false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),		OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
false);		false);
OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +		OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
" bytes/workgroup (compile time only)", false);		" bytes/workgroup (compile time only)", false);

		OutStreamer->emitRawComment(" SGPRBlocks: " +
		Twine(KernelInfo.SGPRBlocks), false);
		OutStreamer->emitRawComment(" VGPRBlocks: " +
		Twine(KernelInfo.VGPRBlocks), false);

		OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
		Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
		OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
		Twine(KernelInfo.NumVGPRsForWavesPerEU), false);

OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),		OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
false);		false);
OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),		OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
false);		false);

if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {		if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +		OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);		Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
▲ Show 20 Lines • Show All 228 Lines • ▼ Show 20 Lines	void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
} else {		} else {
if (STM.isXNACKEnabled())		if (STM.isXNACKEnabled())
ExtraSGPRs = 4;		ExtraSGPRs = 4;

if (FlatUsed)		if (FlatUsed)
ExtraSGPRs = 6;		ExtraSGPRs = 6;
}		}

MaxSGPR += ExtraSGPRs;

// Record first reserved register and reserved register count fields, and		// Record first reserved register and reserved register count fields, and
// update max register counts if "amdgpu-debugger-reserve-regs" attribute was		// update max register counts if "amdgpu-debugger-reserve-regs" attribute was
// specified.		// requested.
if (STM.debuggerReserveRegs()) {		ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
ProgInfo.ReservedVGPRFirst = MaxVGPR + 1;		ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount();
MaxVGPR += MFI->getDebuggerReservedVGPRCount();
}

// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and		// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"		// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
// attribute was specified.		// attribute was requested.
if (STM.debuggerEmitPrologue()) {		if (STM.debuggerEmitPrologue()) {
ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =		ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());		RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
ProgInfo.DebuggerPrivateSegmentBufferSGPR =		ProgInfo.DebuggerPrivateSegmentBufferSGPR =
RI->getHWRegIndex(MFI->getScratchRSrcReg());		RI->getHWRegIndex(MFI->getScratchRSrcReg());
}		}

		// Account for extra SGPRs and VGPRs reserved for debugger use.
		MaxSGPR += ExtraSGPRs;
		MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);

// We found the maximum register index. They start at 0, so add one to get the		// We found the maximum register index. They start at 0, so add one to get the
// number of registers.		// number of registers.
ProgInfo.NumVGPR = MaxVGPR + 1;		ProgInfo.NumVGPR = MaxVGPR + 1;
ProgInfo.NumSGPR = MaxSGPR + 1;		ProgInfo.NumSGPR = MaxSGPR + 1;

		// Adjust number of registers used to meet default/requested minimum/maximum
		// number of waves per execution unit request.
		ProgInfo.NumSGPRsForWavesPerEU = std::max(
		ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
		ProgInfo.NumVGPRsForWavesPerEU = std::max(
		ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));

if (STM.hasSGPRInitBug()) {		if (STM.hasSGPRInitBug()) {
if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {		if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
LLVMContext &Ctx = MF.getFunction()->getContext();		LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(),		DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
"SGPRs with SGPR init bug",		"SGPRs with SGPR init bug",
ProgInfo.NumSGPR, DS_Error);		ProgInfo.NumSGPR, DS_Error);
Ctx.diagnose(Diag);		Ctx.diagnose(Diag);
}		}

ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;		ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
		ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
}		}

if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {		if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
LLVMContext &Ctx = MF.getFunction()->getContext();		LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",		DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
MFI->NumUserSGPRs, DS_Error);		MFI->NumUserSGPRs, DS_Error);
Ctx.diagnose(Diag);		Ctx.diagnose(Diag);
}		}

if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {		if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
LLVMContext &Ctx = MF.getFunction()->getContext();		LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",		DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
MFI->getLDSSize(), DS_Error);		MFI->getLDSSize(), DS_Error);
Ctx.diagnose(Diag);		Ctx.diagnose(Diag);
}		}

ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;		// SGPRBlocks is actual number of SGPR blocks minus 1.
ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;		ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
		RI->getSGPRAllocGranule());
		ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;

		[Inline review comment by arsenm — Unsubmitted, Done] Add a blank line before this comment.
		// VGPRBlocks is actual number of VGPR blocks minus 1.
		ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
		RI->getVGPRAllocGranule());
		ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;

// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode		// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.		// register.
ProgInfo.FloatMode = getFPMode(MF);		ProgInfo.FloatMode = getFPMode(MF);

ProgInfo.IEEEMode = 0;		ProgInfo.IEEEMode = 0;

// Make clamp modifier on NaN input returns 0.		// Make clamp modifier on NaN input returns 0.
ProgInfo.DX10Clamp = 1;		ProgInfo.DX10Clamp = 1;
Show All 9 Lines	void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {		if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
// LDS is allocated in 64 dword blocks.		// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;		LDSAlignShift = 8;
} else {		} else {
// LDS is allocated in 128 dword blocks.		// LDS is allocated in 128 dword blocks.
LDSAlignShift = 9;		LDSAlignShift = 9;
}		}

unsigned LDSSpillSize = MFI->LDSWaveSpillSize *		unsigned LDSSpillSize =
MFI->getMaximumWorkGroupSize(MF);		MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize();

ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;		ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
ProgInfo.LDSBlocks =		ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;		alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

// Scratch is allocated in 256 dword blocks.		// Scratch is allocated in 256 dword blocks.
unsigned ScratchAlignShift = 10;		unsigned ScratchAlignShift = 10;
// We need to program the hardware with the amount of scratch memory that		// We need to program the hardware with the amount of scratch memory that
▲ Show 20 Lines • Show All 448 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Show First 20 Lines • Show All 178 Lines • ▼ Show 20 Lines	bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {

unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);		unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);

// Restrict local memory usage so that we don't drastically reduce occupancy,		// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.		// unless it is already significantly reduced.

// TODO: Have some sort of hint or other heuristics to guess occupancy based		// TODO: Have some sort of hint or other heuristics to guess occupancy based
// on other factors..		// on other factors..
unsigned OccupancyHint		unsigned OccupancyHint = ST.getWavesPerEU(F).second;
= AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0);
if (OccupancyHint == 0)		if (OccupancyHint == 0)
OccupancyHint = 7;		OccupancyHint = 7;

// Clamp to max value.		// Clamp to max value.
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());		OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

// Check the hint but ignore it if it's obviously wrong from the existing LDS		// Check the hint but ignore it if it's obviously wrong from the existing LDS
// usage.		// usage.
MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);		MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);


// Round up to the next tier of usage.		// Round up to the next tier of usage.
unsigned MaxSizeWithWaveCount		unsigned MaxSizeWithWaveCount
▲ Show 20 Lines • Show All 443 Lines • ▼ Show 20 Lines	void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
const Function &ContainingFunction = *I.getParent()->getParent();		const Function &ContainingFunction = *I.getParent()->getParent();

// Don't promote the alloca to LDS for shader calling conventions as the work		// Don't promote the alloca to LDS for shader calling conventions as the work
// item ID intrinsics are not supported for these calling conventions.		// item ID intrinsics are not supported for these calling conventions.
// Furthermore not all LDS is available for some of the stages.		// Furthermore not all LDS is available for some of the stages.
if (AMDGPU::isShader(ContainingFunction.getCallingConv()))		if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
return;		return;

		const AMDGPUSubtarget &ST =
		TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
// FIXME: We should also try to get this value from the reqd_work_group_size		// FIXME: We should also try to get this value from the reqd_work_group_size
// function attribute if it is available.		// function attribute if it is available.
unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);		unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

const DataLayout &DL = Mod->getDataLayout();		const DataLayout &DL = Mod->getDataLayout();

unsigned Align = I.getAlignment();		unsigned Align = I.getAlignment();
if (Align == 0)		if (Align == 0)
Align = DL.getABITypeAlignment(I.getAllocatedType());		Align = DL.getABITypeAlignment(I.getAllocatedType());

// FIXME: This computed padding is likely wrong since it depends on inverse		// FIXME: This computed padding is likely wrong since it depends on inverse
▲ Show 20 Lines • Show All 171 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUSubtarget.h

Show First 20 Lines • Show All 264 Lines • ▼ Show 20 Lines	public:
bool hasUnalignedBufferAccess() const {		bool hasUnalignedBufferAccess() const {
return UnalignedBufferAccess;		return UnalignedBufferAccess;
}		}

bool isXNACKEnabled() const {		bool isXNACKEnabled() const {
return EnableXNACK;		return EnableXNACK;
}		}

unsigned getMaxWavesPerCU() const {
if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return 10;

// FIXME: Not sure what this is for other subtagets.
return 8;
}

/// \brief Returns the offset in bytes from the start of the input buffer		/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.		/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset() const {		unsigned getExplicitKernelArgOffset() const {
return isAmdHsaOS() ? 0 : 36;		return isAmdHsaOS() ? 0 : 36;
}		}

unsigned getStackAlignment() const {		unsigned getStackAlignment() const {
// Scratch is allocated in 256 dword per wave blocks.		// Scratch is allocated in 256 dword per wave blocks.
return 4 * 256 / getWavefrontSize();		return 4 * 256 / getWavefrontSize();
}		}

bool enableMachineScheduler() const override {		bool enableMachineScheduler() const override {
return true;		return true;
}		}

bool enableSubRegLiveness() const override {		bool enableSubRegLiveness() const override {
return true;		return true;
}		}

		/// \returns Number of execution units per compute unit supported by the
		/// subtarget.
		unsigned getEUsPerCU() const {
		return 4;
		}

		/// \returns Maximum number of work groups per compute unit supported by the
		/// subtarget and limited by given flat work group size.
		unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
		if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
		return 8;
		return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
		}

		/// \returns Maximum number of waves per compute unit supported by the
		/// subtarget without any kind of limitation.
		unsigned getMaxWavesPerCU() const {
		return getMaxWavesPerEU() * getEUsPerCU();
		}

		/// \returns Maximum number of waves per compute unit supported by the
		/// subtarget and limited by given flat work group size.
		unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
		unsigned WavesPerWorkGroup = getWavesPerWorkGroup(FlatWorkGroupSize);
		unsigned MaxWorkGroupsPerCU = getMaxWorkGroupsPerCU(FlatWorkGroupSize);
		unsigned MaxWavesPerCU = WavesPerWorkGroup * MaxWorkGroupsPerCU;
		MaxWavesPerCU = std::min(MaxWavesPerCU, getMaxWavesPerCU());
		MaxWavesPerCU = alignDown(MaxWavesPerCU, WavesPerWorkGroup);
		MaxWavesPerCU = MaxWavesPerCU / WavesPerWorkGroup;
		MaxWavesPerCU = MaxWavesPerCU * WavesPerWorkGroup;
		return MaxWavesPerCU;
		}

		/// \returns Minimum number of waves per execution unit supported by the
		/// subtarget.
		unsigned getMinWavesPerEU() const {
		return 1;
		}

		/// \returns Maximum number of waves per execution unit supported by the
		/// subtarget without any kind of limitation.
		unsigned getMaxWavesPerEU() const {
		if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
		return 8;
		// FIXME: Need to take scratch memory into account.
		return 10;
		}

		/// \returns Maximum number of waves per execution unit supported by the
		/// subtarget and limited by given flat work group size.
		unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
		unsigned MaxWavesPerCU = getMaxWavesPerCU(FlatWorkGroupSize);
		unsigned MaxWavesPerEU = alignDown(MaxWavesPerCU, getEUsPerCU());
		MaxWavesPerEU = MaxWavesPerEU / getEUsPerCU();
		return MaxWavesPerEU;
		}

		/// \returns Minimum flat work group size supported by the subtarget.
		unsigned getMinFlatWorkGroupSize() const {
		return 1;
		}

		/// \returns Maximum flat work group size supported by the subtarget.
		unsigned getMaxFlatWorkGroupSize() const {
		return 2048;
		}

		/// \returns Number of waves per work group given the flat work group size.
		unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
		return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
		}

		/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
		/// for function \p F, or minimum/maximum flat work group sizes explicitly
		/// requested using "amdgpu-flat-work-group-size" attribute attached to
		/// function \p F.
		///
		/// \returns Subtarget's default values if explicitly requested values cannot
		/// be converted to integer, or violate subtarget's specifications.
		std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;

		/// \returns Subtarget's default pair of minimum/maximum number of waves per
		/// execution unit for function \p F, or minimum/maximum number of waves per
		/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
		/// attached to function \p F.
		///
		/// \returns Subtarget's default values if explicitly requested values cannot
		/// be converted to integer, violate subtarget's specifications, or are not
		/// compatible with minimum/maximum number of waves limited by flat work group
		/// size, register usage, and/or lds usage.
		std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
};		};

class R600Subtarget final : public AMDGPUSubtarget {		class R600Subtarget final : public AMDGPUSubtarget {
private:		private:
R600InstrInfo InstrInfo;		R600InstrInfo InstrInfo;
R600FrameLowering FrameLowering;		R600FrameLowering FrameLowering;
R600TargetLowering TLInfo;		R600TargetLowering TLInfo;

▲ Show 20 Lines • Show All 121 Lines • ▼ Show 20 Lines	public:

bool hasSGPRInitBug() const {		bool hasSGPRInitBug() const {
return SGPRInitBug;		return SGPRInitBug;
}		}
};		};

} // End namespace llvm		} // End namespace llvm

#endif		#endif
		[Inline review comment by kzhuravl — Unsubmitted, Done] The reason I put this function here is that it returns either the flat work group size for a given function or the subtarget's defaults, but I am not sure this is the right place.
		[Inline review comment by arsenm — Unsubmitted, Done] We probably shouldn't be using IR classes in the subtarget, since there's a desire to someday be able to throw away the IR during codegen, but since there's already one thing doing this here, it is fine for now.
		[Inline review comment by kzhuravl — Unsubmitted, Done] The reason I put this function here is that it returns either the number of active waves per execution unit for a given function or the subtarget's defaults, but I am not sure this is the right place.

lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Show First 20 Lines • Show All 173 Lines • ▼ Show 20 Lines	if (Bytes <= 5461)
return 3;		return 3;

if (Bytes <= 8192)		if (Bytes <= 8192)
return 2;		return 2;

return 1;		return 1;
}		}

		std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
		const Function &F) const {

		// Default minimum/maximum flat work group sizes.
		std::pair<unsigned, unsigned> Default =
		AMDGPU::isCompute(F.getCallingConv()) ?
		std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
		getWavefrontSize() * 4) :
		std::pair<unsigned, unsigned>(1, getWavefrontSize());

		// TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
		// starts using "amdgpu-flat-work-group-size" attribute.
		Default.second = AMDGPU::getIntegerAttribute(
		F, "amdgpu-max-work-group-size", Default.second);
		Default.first = std::min(Default.first, Default.second);

		// Requested minimum/maximum flat work group sizes.
		std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
		F, "amdgpu-flat-work-group-size", Default);

		// Make sure requested minimum is less than requested maximum.
		if (Requested.first > Requested.second)
		return Default;

		// Make sure requested values do not violate subtarget's specifications.
		if (Requested.first < getMinFlatWorkGroupSize())
		return Default;
		if (Requested.second > getMaxFlatWorkGroupSize())
		return Default;

		return Requested;
		[Inline review comment by arsenm — Unsubmitted, Done] You can use those as the initializer value instead of the `=`.
		}

		std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
		const Function &F) const {

		// Default minimum/maximum number of waves per execution unit.
		std::pair<unsigned, unsigned> Default(1, 0);

		// Default/requested minimum/maximum flat work group sizes.
		std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

		// If minimum/maximum flat work group sizes were explicitly requested using
		// "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
		// number of waves per execution unit to values implied by requested
		// minimum/maximum flat work group sizes.
		unsigned ImpliedByMinFlatWorkGroupSize =
		getMaxWavesPerEU(FlatWorkGroupSizes.first);
		unsigned ImpliedByMaxFlatWorkGroupSize =
		getMaxWavesPerEU(FlatWorkGroupSizes.second);
		unsigned MinImpliedByFlatWorkGroupSize =
		std::min(ImpliedByMinFlatWorkGroupSize, ImpliedByMaxFlatWorkGroupSize);
		unsigned MaxImpliedByFlatWorkGroupSize =
		std::max(ImpliedByMinFlatWorkGroupSize, ImpliedByMaxFlatWorkGroupSize);

		// TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
		// starts using "amdgpu-flat-work-group-size" attribute.
		if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
		F.hasFnAttribute("amdgpu-flat-work-group-size")) {
		Default.first = MinImpliedByFlatWorkGroupSize;
		Default.second = MaxImpliedByFlatWorkGroupSize;
		}

		// Requested minimum/maximum number of waves per execution unit.
		std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
		F, "amdgpu-waves-per-eu", Default, true);

		// Make sure requested minimum is less than requested maximum.
		if (Requested.second && Requested.first > Requested.second)
		return Default;

		// Make sure requested values do not violate subtarget's specifications.
		if (Requested.first < getMinWavesPerEU() ||
		Requested.first > getMaxWavesPerEU())
		return Default;
		if (Requested.second > getMaxWavesPerEU())
		return Default;

		// Make sure requested values are compatible with values implied by requested
		// minimum/maximum flat work group sizes.
		if (Requested.first > MinImpliedByFlatWorkGroupSize ||
		Requested.second > MaxImpliedByFlatWorkGroupSize)
		return Default;

		return Requested;
		}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,		R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :		const TargetMachine &TM) :
AMDGPUSubtarget(TT, GPU, FS, TM),		AMDGPUSubtarget(TT, GPU, FS, TM),
InstrInfo(*this),		InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),		FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
TLInfo(TM, *this) {}		TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,		SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
Show All 18 Lines	void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,

// Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.		// Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
if (!enableSIScheduler())		if (!enableSIScheduler())
Policy.ShouldTrackLaneMasks = true;		Policy.ShouldTrackLaneMasks = true;
}		}

bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {		bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());		return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}		}
		[Inline review comment by arsenm — Unsubmitted, Done] This should use DiagnosticInfoUnsupported. Although, on second thought, these should probably just clamp and be diagnosed in clang.
		[Inline review comment by arsenm — Unsubmitted, Done] Ditto.
		[Inline review comment by arsenm — Unsubmitted, Done] Ditto.
		[Inline review comment by arsenm — Unsubmitted, Done] More DiagnosticInfos. These should also provide the function so clang will be able to provide a more useful debug location.
		[Inline review comment by kzhuravl — Unsubmitted, Not Done] Changed to clamp.
		[Inline review comment by arsenm — Unsubmitted, Done] Can you put blank lines after code before a comment? It's hard to read with it this dense.

lib/Target/AMDGPU/SIInstrInfo.cpp

	Show All 22 Lines
	#include "llvm/CodeGen/ScheduleDAG.h"			#include "llvm/CodeGen/ScheduleDAG.h"
	#include "llvm/IR/Function.h"			#include "llvm/IR/Function.h"
	#include "llvm/CodeGen/RegisterScavenging.h"			#include "llvm/CodeGen/RegisterScavenging.h"
	#include "llvm/MC/MCInstrDesc.h"			#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/Support/Debug.h"			#include "llvm/Support/Debug.h"

	using namespace llvm;			using namespace llvm;

	SIInstrInfo::SIInstrInfo(const SISubtarget &ST)			SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
	: AMDGPUInstrInfo(ST), RI(), ST(ST) {}			: AMDGPUInstrInfo(ST), RI(), ST(ST) {}
				[Inline review comment by arsenm — Unsubmitted, Done] This doesn't look right.

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// TargetInstrInfo callbacks			// TargetInstrInfo callbacks
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	static unsigned getNumOperandsNoGlue(SDNode *Node) {			static unsigned getNumOperandsNoGlue(SDNode *Node) {
	unsigned N = Node->getNumOperands();			unsigned N = Node->getNumOperands();
	while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)			while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
	▲ Show 20 Lines • Show All 683 Lines • ▼ Show 20 Lines
	unsigned SIInstrInfo::calculateLDSSpillAddress(			unsigned SIInstrInfo::calculateLDSSpillAddress(
	MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,			MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
	unsigned FrameOffset, unsigned Size) const {			unsigned FrameOffset, unsigned Size) const {
	MachineFunction *MF = MBB.getParent();			MachineFunction *MF = MBB.getParent();
	SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();			SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
	const SISubtarget &ST = MF->getSubtarget<SISubtarget>();			const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
	const SIRegisterInfo *TRI = ST.getRegisterInfo();			const SIRegisterInfo *TRI = ST.getRegisterInfo();
	DebugLoc DL = MBB.findDebugLoc(MI);			DebugLoc DL = MBB.findDebugLoc(MI);
	unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);			unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
	unsigned WavefrontSize = ST.getWavefrontSize();			unsigned WavefrontSize = ST.getWavefrontSize();

	unsigned TIDReg = MFI->getTIDReg();			unsigned TIDReg = MFI->getTIDReg();
	if (!MFI->hasCalculatedTID()) {			if (!MFI->hasCalculatedTID()) {
	MachineBasicBlock &Entry = MBB.getParent()->front();			MachineBasicBlock &Entry = MBB.getParent()->front();
	MachineBasicBlock::iterator Insert = Entry.front();			MachineBasicBlock::iterator Insert = Entry.front();
	DebugLoc DL = Insert->getDebugLoc();			DebugLoc DL = Insert->getDebugLoc();

	▲ Show 20 Lines • Show All 2,475 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIMachineFunctionInfo.h

Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned WorkGroupIDZSystemSGPR;		unsigned WorkGroupIDZSystemSGPR;
unsigned WorkGroupInfoSystemSGPR;		unsigned WorkGroupInfoSystemSGPR;
unsigned PrivateSegmentWaveByteOffsetSystemSGPR;		unsigned PrivateSegmentWaveByteOffsetSystemSGPR;

// Graphics info.		// Graphics info.
unsigned PSInputAddr;		unsigned PSInputAddr;
bool ReturnsVoid;		bool ReturnsVoid;

unsigned MaximumWorkGroupSize;		// A pair of default/requested minimum/maximum flat work group sizes.
		// Minimum - first, maximum - second.
		std::pair<unsigned, unsigned> FlatWorkGroupSizes;

		// A pair of default/requested minimum/maximum number of waves per execution
		// unit. Minimum - first, maximum - second.
		std::pair<unsigned, unsigned> WavesPerEU;

// Number of reserved VGPRs for debugger usage.
unsigned DebuggerReservedVGPRCount;
// Stack object indices for work group IDs.		// Stack object indices for work group IDs.
std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;		std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;
// Stack object indices for work item IDs.		// Stack object indices for work item IDs.
		arsenmUnsubmitted Done Reply Inline Actions Separating line arsenm: Separating line
std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;		std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;

public:		public:
// FIXME: Make private		// FIXME: Make private
unsigned LDSWaveSpillSize;		unsigned LDSWaveSpillSize;
unsigned PSInputEna;		unsigned PSInputEna;
std::map<unsigned, unsigned> LaneVGPRs;		std::map<unsigned, unsigned> LaneVGPRs;
unsigned ScratchOffsetReg;		unsigned ScratchOffsetReg;
▲ Show 20 Lines • Show All 260 Lines • ▼ Show 20 Lines	public:
bool returnsVoid() const {		bool returnsVoid() const {
return ReturnsVoid;		return ReturnsVoid;
}		}

void setIfReturnsVoid(bool Value) {		void setIfReturnsVoid(bool Value) {
ReturnsVoid = Value;		ReturnsVoid = Value;
}		}

/// \returns Number of reserved VGPRs for debugger usage.		/// \returns A pair of default/requested minimum/maximum flat work group sizes
unsigned getDebuggerReservedVGPRCount() const {		/// for this function.
return DebuggerReservedVGPRCount;		std::pair<unsigned, unsigned> getFlatWorkGroupSizes() const {
		return FlatWorkGroupSizes;
		}

		/// \returns Default/requested minimum flat work group size for this function.
		unsigned getMinFlatWorkGroupSize() const {
		return FlatWorkGroupSizes.first;
		}

		/// \returns Default/requested maximum flat work group size for this function.
		unsigned getMaxFlatWorkGroupSize() const {
		return FlatWorkGroupSizes.second;
		}

		/// \returns A pair of default/requested minimum/maximum number of waves per
		/// execution unit.
		std::pair<unsigned, unsigned> getWavesPerEU() const {
		return WavesPerEU;
		}

		/// \returns Default/requested minimum number of waves per execution unit.
		unsigned getMinWavesPerEU() const {
		return WavesPerEU.first;
		}

		/// \returns Default/requested maximum number of waves per execution unit.
		unsigned getMaxWavesPerEU() const {
		return WavesPerEU.second;
		arsenmUnsubmitted Done Reply Inline Actions The Returns should be \returns to doxygenify, same for the rest of these arsenm: The Returns should be \returns to doxygenify, same for the rest of these
}		}

/// \returns Stack object index for \p Dim's work group ID.		/// \returns Stack object index for \p Dim's work group ID.
int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {		int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
assert(Dim < 3);		assert(Dim < 3);
return DebuggerWorkGroupIDStackObjectIndices[Dim];		return DebuggerWorkGroupIDStackObjectIndices[Dim];
}		}

▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	case 1:
assert(hasWorkItemIDY());		assert(hasWorkItemIDY());
return AMDGPU::VGPR1;		return AMDGPU::VGPR1;
case 2:		case 2:
assert(hasWorkItemIDZ());		assert(hasWorkItemIDZ());
return AMDGPU::VGPR2;		return AMDGPU::VGPR2;
}		}
llvm_unreachable("unexpected dimension");		llvm_unreachable("unexpected dimension");
}		}

unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};		};

} // End namespace llvm		} // End namespace llvm

#endif		#endif

lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	: AMDGPUMachineFunction(MF),
GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),		GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),		WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),		WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),		WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),		WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),		PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
PSInputAddr(0),		PSInputAddr(0),
ReturnsVoid(true),		ReturnsVoid(true),
MaximumWorkGroupSize(0),		FlatWorkGroupSizes(0, 0),
DebuggerReservedVGPRCount(0),		WavesPerEU(0, 0),
DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),		DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),		DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
LDSWaveSpillSize(0),		LDSWaveSpillSize(0),
		arsenmUnsubmitted Done Reply Inline Actions I think just 0, 0 or 0u, 0u should work arsenm: I think just 0, 0 or 0u, 0u should work
PSInputEna(0),		PSInputEna(0),
NumUserSGPRs(0),		NumUserSGPRs(0),
NumSystemSGPRs(0),		NumSystemSGPRs(0),
HasSpilledSGPRs(false),		HasSpilledSGPRs(false),
HasSpilledVGPRs(false),		HasSpilledVGPRs(false),
HasNonSpillStackObjects(false),		HasNonSpillStackObjects(false),
NumSpilledSGPRs(0),		NumSpilledSGPRs(0),
NumSpilledVGPRs(0),		NumSpilledVGPRs(0),
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)

// We don't need to worry about accessing spills with flat instructions.		// We don't need to worry about accessing spills with flat instructions.
// TODO: On VI where we must use flat for global, we should be able to omit		// TODO: On VI where we must use flat for global, we should be able to omit
// this if it is never used for generic access.		// this if it is never used for generic access.
if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&		if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
ST.isAmdHsaOS())		ST.isAmdHsaOS())
FlatScratchInit = true;		FlatScratchInit = true;

if (AMDGPU::isCompute(F->getCallingConv()))		FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);		WavesPerEU = ST.getWavesPerEU(*F);
else
MaximumWorkGroupSize = ST.getWavefrontSize();

if (ST.debuggerReserveRegs())
DebuggerReservedVGPRCount = 4;
}		}

unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(		unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {		const SIRegisterInfo &TRI) {
PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(		PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);		getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
NumUserSGPRs += 4;		NumUserSGPRs += 4;
return PrivateSegmentBufferUserSGPR;		return PrivateSegmentBufferUserSGPR;
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines	for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
BI != BE; ++BI) {		BI != BE; ++BI) {
BI->addLiveIn(LaneVGPR);		BI->addLiveIn(LaneVGPR);
}		}
}		}

Spill.VGPR = LaneVGPRs[LaneVGPRIdx];		Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
return Spill;		return Spill;
}		}

unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
const MachineFunction &MF) const {
return MaximumWorkGroupSize;
}

lib/Target/AMDGPU/SIRegisterInfo.h

Show All 19 Lines

namespace llvm {		namespace llvm {

class SISubtarget;		class SISubtarget;
class MachineRegisterInfo;		class MachineRegisterInfo;

struct SIRegisterInfo final : public AMDGPURegisterInfo {		struct SIRegisterInfo final : public AMDGPURegisterInfo {
private:		private:
unsigned SGPR32SetID;		unsigned SGPR32SetID;
		arsenmUnsubmitted Done Reply Inline Actions I don't think the RegisterInfo should have a reference to the Subtarget arsenm: I don't think the RegisterInfo should have a reference to the Subtarget
unsigned VGPR32SetID;		unsigned VGPR32SetID;
BitVector SGPRPressureSets;		BitVector SGPRPressureSets;
BitVector VGPRPressureSets;		BitVector VGPRPressureSets;

void reserveRegisterTuples(BitVector &, unsigned Reg) const;		void reserveRegisterTuples(BitVector &, unsigned Reg) const;
void classifyPressureSet(unsigned PSetID, unsigned Reg,		void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;		BitVector &PressureSets) const;

▲ Show 20 Lines • Show All 128 Lines • ▼ Show 20 Lines	enum PreloadedValue {
WORKITEM_ID_Y = 16,		WORKITEM_ID_Y = 16,
WORKITEM_ID_Z = 17		WORKITEM_ID_Z = 17
};		};

/// \brief Returns the physical register that \p Value is stored in.		/// \brief Returns the physical register that \p Value is stored in.
unsigned getPreloadedValue(const MachineFunction &MF,		unsigned getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const;		enum PreloadedValue Value) const;

/// \brief Give the maximum number of VGPRs that can be used by \p WaveCount
/// concurrent waves.
unsigned getNumVGPRsAllowed(unsigned WaveCount) const;

/// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
/// concurrent waves.
unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const;

unsigned findUnusedRegister(const MachineRegisterInfo &MRI,		unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC,		const TargetRegisterClass *RC,
const MachineFunction &MF) const;		const MachineFunction &MF) const;

unsigned getSGPR32PressureSet() const { return SGPR32SetID; };		unsigned getSGPR32PressureSet() const { return SGPR32SetID; };
unsigned getVGPR32PressureSet() const { return VGPR32SetID; };		unsigned getVGPR32PressureSet() const { return VGPR32SetID; };

bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;		bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;

		/// \returns SGPR allocation granularity supported by the subtarget.
		unsigned getSGPRAllocGranule() const {
		return 8;
		}

		/// \returns Total number of SGPRs supported by the subtarget.
		arsenmUnsubmitted Done Reply Inline Actions More \returns arsenm: More \returns
		unsigned getTotalNumSGPRs(const SISubtarget &ST) const;

		/// \returns Number of addressable SGPRs supported by the subtarget.
		unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;

		/// \returns Number of reserved SGPRs supported by the subtarget.
		unsigned getNumReservedSGPRs(const SISubtarget &ST) const;

		/// \returns Minimum number of SGPRs that meets given number of waves per
		/// execution unit requirement for given subtarget.
		unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;

		/// \returns Maximum number of SGPRs that meets given number of waves per
		/// execution unit requirement for given subtarget.
		unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;

		/// \returns Maximum number of SGPRs that meets number of waves per execution
		/// unit requirement for function \p MF, or number of SGPRs explicitly
		/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
		///
		/// \returns Value that meets number of waves per execution unit requirement
		/// if explicitly requested value cannot be converted to integer, violates
		/// subtarget's specifications, or does not meet number of waves per execution
		/// unit requirement.
		unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

		/// \returns VGPR allocation granularity supported by the subtarget.
		unsigned getVGPRAllocGranule() const {
		return 4;
		}

		/// \returns Total number of VGPRs supported by the subtarget.
		unsigned getTotalNumVGPRs() const {
		return 256;
		}

		kzhuravlUnsubmitted Done Reply Inline Actions Is there a better place for this function? kzhuravl: Is there a better place for this function?
		/// \returns Number of reserved VGPRs for debugger use supported by the
		/// subtarget.
		unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;

		/// \returns Minimum number of VGPRs that meets given number of waves per
		/// execution unit requirement.
		unsigned getMinNumVGPRs(unsigned WavesPerEU) const;

		/// \returns Maximum number of VGPRs that meets given number of waves per
		/// execution unit requirement.
		unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;

		/// \returns Maximum number of VGPRs that meets number of waves per execution
		/// unit requirement for function \p MF, or number of VGPRs explicitly
		/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
		///
		/// \returns Value that meets number of waves per execution unit requirement
		/// if explicitly requested value cannot be converted to integer, violates
		/// subtarget's specifications, or does not meet number of waves per execution
		/// unit requirement.
		unsigned getMaxNumVGPRs(const MachineFunction &MF) const;

private:		private:
void buildScratchLoadStore(MachineBasicBlock::iterator MI,		void buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp, const MachineOperand *SrcDst,		unsigned LoadStoreOp, const MachineOperand *SrcDst,
unsigned ScratchRsrcReg, unsigned ScratchOffset,		unsigned ScratchRsrcReg, unsigned ScratchOffset,
int64_t Offset,		int64_t Offset,
RegScavenger *RS) const;		RegScavenger *RS) const;
};		};

} // End namespace llvm		} // End namespace llvm

#endif		#endif
		kzhuravlUnsubmitted Done Reply Inline Actions Is there a better place for this function? kzhuravl: Is there a better place for this function?

lib/Target/AMDGPU/SIRegisterInfo.cpp

Show All 18 Lines
#include "llvm/CodeGen/MachineFrameInfo.h"		#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"		#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"		#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"

using namespace llvm;		using namespace llvm;

static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
unsigned SIMDPerCU = 4;

unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
MaxInvocationsPerWave;
}

static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);

unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
unsigned ReservedSGPRCount;

if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
TotalSGPRCountPerSIMD = 800;
AddressableSGPRCount = 102;
SGPRUsageAlignment = 16;
ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
} else {
TotalSGPRCountPerSIMD = 512;
AddressableSGPRCount = 104;
SGPRUsageAlignment = 8;
ReservedSGPRCount = 2; // VCC
}

unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);

if (ST.hasSGPRInitBug())
MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
}

static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
unsigned TotalVGPRCountPerSIMD = 256;
unsigned VGPRUsageAlignment = 4;

return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
VGPRUsageAlignment);
}

static bool hasPressureSet(const int *PSets, unsigned PSetID) {		static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {		for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)		if (PSets[i] == (int)PSetID)
return true;		return true;
}		}
return false;		return false;
}		}

Show All 32 Lines	void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
MCRegAliasIterator R(Reg, this, true);		MCRegAliasIterator R(Reg, this, true);

for (; R.isValid(); ++R)		for (; R.isValid(); ++R)
Reserved.set(*R);		Reserved.set(*R);
}		}

unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(		unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {		const MachineFunction &MF) const {
unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;		unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));		unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);		return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}		}

unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(		unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {		const MachineFunction &MF) const {
unsigned RegCount = getMaxWorkGroupSGPRCount(MF);		unsigned RegCount = getMaxNumSGPRs(MF);
unsigned Reg;		unsigned Reg;

// Try to place it in a hole after PrivateSegmentbufferReg.		// Try to place it in a hole after PrivateSegmentbufferReg.
if (RegCount & 3) {		if (RegCount & 3) {
// We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to		// We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
// alignment constraints, so we have a hole where we can put the wave offset.		// alignment constraints, so we have a hole where we can put the wave offset.
Reg = RegCount - 1;		Reg = RegCount - 1;
} else {		} else {
Show All 18 Lines	BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TMA);		reserveRegisterTuples(Reserved, AMDGPU::TMA);
reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);		reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);		reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);		reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);		reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);		reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);		reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);

unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);		unsigned MaxNumSGPRs = getMaxNumSGPRs(MF);
unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);		unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
		for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);		unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);		reserveRegisterTuples(Reserved, Reg);
}		}

		unsigned MaxNumVGPRs = getMaxNumVGPRs(MF);
for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {		unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
		for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);		unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);		reserveRegisterTuples(Reserved, Reg);
}		}

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();		const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();		unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {		if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
// Reserve 1 SGPR for scratch wave offset in case we need to spill.		// Reserve 1 SGPR for scratch wave offset in case we need to spill.
reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);		reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
}		}

unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();		unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {		if (ScratchRSrcReg != AMDGPU::NoRegister) {
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need		// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
// to spill.		// to spill.
// TODO: May need to reserve a VGPR if doing LDS spilling.		// TODO: May need to reserve a VGPR if doing LDS spilling.
reserveRegisterTuples(Reserved, ScratchRSrcReg);		reserveRegisterTuples(Reserved, ScratchRSrcReg);
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));		assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}		}

// Reserve registers for debugger usage if "amdgpu-debugger-reserve-trap-regs"
// attribute was specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.debuggerReserveRegs()) {
unsigned ReservedVGPRFirst =
MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount();
for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
}

return Reserved;		return Reserved;
}		}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,		unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {		unsigned Idx) const {
const SISubtarget &STI = MF.getSubtarget<SISubtarget>();		unsigned SGPRLimit = getMaxNumSGPRs(MF);
// FIXME: We should adjust the max number of waves based on LDS size.		unsigned VGPRLimit = getMaxNumVGPRs(MF);
unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU());
unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());

		tstellarAMDAuthorUnsubmitted Not Done Reply Inline Actions This actually uncovers a bug, because the scheduler will subtract the number of reserved registers from the value returned by this function, which in some cases gives us a negative pressure set limit. I think we need to remove this function before this patch can be committed. I will look into this. tstellarAMD: This actually uncovers a bug, because the scheduler will subtract the number of reserved…
		tstellarAMDAuthorUnsubmitted Done Reply Inline Actions The fix for this is more complicated than I thought. I think we should fix this by maintaining the current behavior of this function. This means that if the user does not specify any attributes that would limit the number of registers, then SGPRLimit and VGPRLimit should be set to the 10 waves per CU limits (24 VGPRs and 48/80 SGPRs depending on the target). If the user does specify attributes to limit the number of registers, the function should return AMDGPURegisterInfo::getRegPressureSetLimit(). tstellarAMD: The fix for this is more complicated than I thought. I think we should fix this by maintaining…
unsigned VSLimit = SGPRLimit + VGPRLimit;		unsigned VSLimit = SGPRLimit + VGPRLimit;

if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {		if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
// FIXME: This is a hack. We should never be considering the pressure of		// FIXME: This is a hack. We should never be considering the pressure of
// these since no virtual register should ever have this class.		// these since no virtual register should ever have this class.
return VSLimit;		return VSLimit;
}		}

▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
int64_t Offset) const {		int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();		MachineBasicBlock::iterator Ins = MBB->begin();
DebugLoc DL; // Defaults to "unknown"		DebugLoc DL; // Defaults to "unknown"

if (Ins != MBB->end())		if (Ins != MBB->end())
DL = Ins->getDebugLoc();		DL = Ins->getDebugLoc();

MachineFunction *MF = MBB->getParent();		MachineFunction *MF = MBB->getParent();
const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();		const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = Subtarget.getInstrInfo();		const SIInstrInfo *TII = Subtarget.getInstrInfo();
		arsenmUnsubmitted Done Reply Inline Actions SIInstrInfo arsenm: SIInstrInfo

if (Offset == 0) {		if (Offset == 0) {
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)		BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
.addFrameIndex(FrameIdx);		.addFrameIndex(FrameIdx);
return;		return;
}		}

MachineRegisterInfo &MRI = MF->getRegInfo();		MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);		unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);		unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)		BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);		.addImm(Offset);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)		BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
.addReg(UnusedCarry, RegState::Define \| RegState::Dead)		.addReg(UnusedCarry, RegState::Define \| RegState::Dead)
.addReg(OffsetReg, RegState::Kill)		.addReg(OffsetReg, RegState::Kill)
.addFrameIndex(FrameIdx);		.addFrameIndex(FrameIdx);
}		}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,		void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
int64_t Offset) const {		int64_t Offset) const {

MachineBasicBlock *MBB = MI.getParent();		MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();		MachineFunction *MF = MBB->getParent();
const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();		const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = Subtarget.getInstrInfo();		const SIInstrInfo *TII = Subtarget.getInstrInfo();
		arsenmUnsubmitted Done Reply Inline Actions You can remove the cast arsenm: You can remove the cast

#ifndef NDEBUG		#ifndef NDEBUG
// FIXME: Is it possible to be storing a frame index to itself?		// FIXME: Is it possible to be storing a frame index to itself?
bool SeenFI = false;		bool SeenFI = false;
for (const MachineOperand &MO: MI.operands()) {		for (const MachineOperand &MO: MI.operands()) {
if (MO.isFI()) {		if (MO.isFI()) {
if (SeenFI)		if (SeenFI)
llvm_unreachable("should not see multiple frame indices");		llvm_unreachable("should not see multiple frame indices");
▲ Show 20 Lines • Show All 584 Lines • ▼ Show 20 Lines	SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
const MachineFunction &MF) const {		const MachineFunction &MF) const {

for (unsigned Reg : *RC)		for (unsigned Reg : *RC)
if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))		if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
return Reg;		return Reg;
return AMDGPU::NoRegister;		return AMDGPU::NoRegister;
}		}

unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {		bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
switch(WaveCount) {		unsigned Reg) const {
case 10: return 24;		const TargetRegisterClass *RC;
case 9: return 28;		if (TargetRegisterInfo::isVirtualRegister(Reg))
case 8: return 32;		RC = MRI.getRegClass(Reg);
case 7: return 36;		else
case 6: return 40;		RC = getPhysRegClass(Reg);
case 5: return 48;
case 4: return 64;		return hasVGPRs(RC);
case 3: return 84;		}
case 2: return 128;
default: return 256;		unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
		if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
		return 800;
		return 512;
		}

		unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
		if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
		return 102;
		return 104;
		}

		unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST) const {
		if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
		return 6; // VCC, FLAT_SCRATCH, XNACK.
		arsenmUnsubmitted Not Done Reply Inline Actions This should be narrowed to just if the xnack feature is enabled arsenm: This should be narrowed to just if the xnack feature is enabled
		tstellarAMDAuthorUnsubmitted Not Done Reply Inline Actions We have to use 6 here for all VI, because FLAT_SCRATCH is located 2 sgprs below XNACK, so if you use FLAT_SCRATCH, you have to allocate space for XNACK too, even if the device does not support it. tstellarAMD: We have to use 6 here for all VI, because FLAT_SCRATCH is located 2 sgprs below XNACK, so if…
		return 2; // VCC.
		}

		unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
		unsigned WavesPerEU) const {
		if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
		switch (WavesPerEU) {
		case 0: return 0;
		case 10: return 0;
		case 9: return 0;
		case 8: return 81;
		default: return 97;
		}
		} else {
		switch (WavesPerEU) {
		case 0: return 0;
		case 10: return 0;
		case 9: return 49;
		case 8: return 57;
		case 7: return 65;
		case 6: return 73;
		case 5: return 81;
		default: return 97;
		}
}		}
}		}

unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST,		unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
unsigned WaveCount) const {		unsigned WavesPerEU) const {
if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {		if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
switch (WaveCount) {		switch (WavesPerEU) {
		case 0: return 80;
case 10: return 80;		case 10: return 80;
case 9: return 80;		case 9: return 80;
case 8: return 96;		case 8: return 96;
default: return 102;		default: return getNumAddressableSGPRs(ST);
}		}
} else {		} else {
switch(WaveCount) {		switch (WavesPerEU) {
		case 0: return 48;
case 10: return 48;		case 10: return 48;
case 9: return 56;		case 9: return 56;
case 8: return 64;		case 8: return 64;
case 7: return 72;		case 7: return 72;
case 6: return 80;		case 6: return 80;
case 5: return 96;		case 5: return 96;
default: return 103;		default: return getNumAddressableSGPRs(ST);
}		}
}		}
}		}

// Side-by-side diff rows: the left column holds old-side text of
// SIRegisterInfo::isVGPR(); the right column is the new
// getMaxNumSGPRs(const MachineFunction &) overload.
//
// New side: computes how many SGPRs the function \p MF may use.
//  1. Start from getMaxNumSGPRs(ST, WavesPerEU.first), where WavesPerEU is the
//     pair returned by SIMachineFunctionInfo::getWavesPerEU().
//  2. If the function carries "amdgpu-num-sgpr", use that value instead, unless
//     one of the checks below resets it to 0 (which discards the request).
//  3. If the subtarget has the SGPR init bug, force
//     SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG.
//  4. Return the result minus getNumReservedSGPRs(ST).
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,		unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
unsigned Reg) const {		const Function &F = *MF.getFunction();
const TargetRegisterClass *RC;
if (TargetRegisterInfo::isVirtualRegister(Reg))
RC = MRI.getRegClass(Reg);
else
RC = getPhysRegClass(Reg);

return hasVGPRs(RC);		const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
		const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

		// Compute maximum number of SGPRs function can use using default/requested
		// minimum number of waves per execution unit.
		std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
		unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first);

		// Check if maximum number of SGPRs was explicitly requested using
		// "amdgpu-num-sgpr" attribute.
		if (F.hasFnAttribute("amdgpu-num-sgpr")) {
		unsigned Requested = AMDGPU::getIntegerAttribute(
		F, "amdgpu-num-sgpr", MaxNumSGPRs);

		// Make sure requested value does not violate subtarget's specifications.
		// A request that does not exceed the reserved SGPR count is discarded.
		if (Requested && Requested <= getNumReservedSGPRs(ST))
		Requested = 0;

		// Make sure requested value is compatible with values implied by
		// default/requested minimum/maximum number of waves per execution unit.
		if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first))
		Requested = 0;
		// The lower-bound check is skipped when WavesPerEU.second is 0.
		if (WavesPerEU.second &&
		Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
		Requested = 0;

		if (Requested)
		MaxNumSGPRs = Requested;
		}

		// This override is applied after, and regardless of, the attribute above.
		if (ST.hasSGPRInitBug())
		MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

		// NOTE(review): unsigned subtraction; assumes MaxNumSGPRs is never smaller
		// than getNumReservedSGPRs(ST) -- confirm for the init-bug path.
		return MaxNumSGPRs - getNumReservedSGPRs(ST);
		}

		// Number of VGPRs reserved when the subtarget reports debuggerReserveRegs():
		// 4 in that case, 0 otherwise.
		unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
		    const SISubtarget &ST) const {
		  return ST.debuggerReserveRegs() ? 4 : 0;
		}

		// Looks up a minimum VGPR count for \p WavesPerEU. Wave counts of 0 and 10
		// map to 0; 1 and anything above 10 map to 129.
		unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
		  // Indexed directly by WavesPerEU (0..10).
		  static const unsigned MinVGPRsByWaves[] = {
		      /*0*/ 0,  /*1*/ 129, /*2*/ 85, /*3*/ 65, /*4*/ 49, /*5*/ 41,
		      /*6*/ 37, /*7*/ 33,  /*8*/ 29, /*9*/ 25, /*10*/ 0};

		  if (WavesPerEU > 10)
		    return 129;
		  return MinVGPRsByWaves[WavesPerEU];
		}

		// Looks up a maximum VGPR count for \p WavesPerEU. Wave counts of 0 and 10
		// both map to 24; 1 and anything above 10 map to getTotalNumVGPRs().
		unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
		  if (WavesPerEU == 0 || WavesPerEU == 10)
		    return 24;
		  if (WavesPerEU < 2 || WavesPerEU > 10)
		    return getTotalNumVGPRs();

		  // Entries for WavesPerEU = 2..9, indexed by WavesPerEU - 2.
		  static const unsigned MaxVGPRsByWaves[] = {128, 84, 64, 48,
		                                             40,  36, 32, 28};
		  return MaxVGPRsByWaves[WavesPerEU - 2];
		}

		// Computes how many VGPRs the function \p MF may use.
		//  1. Start from getMaxNumVGPRs(WavesPerEU.first), where WavesPerEU is the
		//     pair returned by SIMachineFunctionInfo::getWavesPerEU().
		//  2. If the function carries "amdgpu-num-vgpr", use that value instead,
		//     unless one of the checks below resets it to 0 (which discards the
		//     request).
		//  3. Return the result minus getNumDebuggerReservedVGPRs(ST).
		unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
		const Function &F = *MF.getFunction();

		const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
		const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

		// Compute maximum number of VGPRs function can use using default/requested
		// minimum number of waves per execution unit.
		std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
		unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

		// Check if maximum number of VGPRs was explicitly requested using
		// "amdgpu-num-vgpr" attribute.
		if (F.hasFnAttribute("amdgpu-num-vgpr")) {
		unsigned Requested = AMDGPU::getIntegerAttribute(
		F, "amdgpu-num-vgpr", MaxNumVGPRs);

		// Make sure requested value does not violate subtarget's specifications.
		// A request that does not exceed the debugger-reserved count is discarded.
		if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST))
		Requested = 0;

		// Make sure requested value is compatible with values implied by
		// default/requested minimum/maximum number of waves per execution unit.
		if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
		Requested = 0;
		// The lower-bound check is skipped when WavesPerEU.second is 0.
		if (WavesPerEU.second &&
		Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
		Requested = 0;

		if (Requested)
		MaxNumVGPRs = Requested;
		}

		return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST);
}		}

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

	Show All 39 Lines
	MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);			MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);

	MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);			MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);

	bool isGroupSegment(const GlobalValue *GV);			bool isGroupSegment(const GlobalValue *GV);
	bool isGlobalSegment(const GlobalValue *GV);			bool isGlobalSegment(const GlobalValue *GV);
	bool isReadOnlySegment(const GlobalValue *GV);			bool isReadOnlySegment(const GlobalValue *GV);

				/// \returns Integer value requested using \p F's \p Name attribute.
				///
				/// \returns \p Default if attribute is not present.
				///
				/// \returns \p Default and emits error if requested value cannot be converted
				/// to integer.
	int getIntegerAttribute(const Function &F, StringRef Name, int Default);			int getIntegerAttribute(const Function &F, StringRef Name, int Default);

	unsigned getMaximumWorkGroupSize(const Function &F);			/// \returns A pair of integer values requested using \p F's \p Name attribute
				/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
				/// is false).
				///
				/// \returns \p Default if attribute is not present.
				///
				/// \returns \p Default and emits error if one of the requested values cannot be
				/// converted to integer, or \p OnlyFirstRequired is false and "second" value is
				/// not present.
				std::pair<int, int> getIntegerPairAttribute(const Function &F,
				StringRef Name,
				std::pair<int, int> Default,
				bool OnlyFirstRequired = false);

	unsigned getInitialPSInputAddr(const Function &F);			unsigned getInitialPSInputAddr(const Function &F);

	bool isShader(CallingConv::ID cc);			bool isShader(CallingConv::ID cc);
	bool isCompute(CallingConv::ID cc);			bool isCompute(CallingConv::ID cc);

	bool isSI(const MCSubtargetInfo &STI);			bool isSI(const MCSubtargetInfo &STI);
	bool isCI(const MCSubtargetInfo &STI);			bool isCI(const MCSubtargetInfo &STI);
	bool isVI(const MCSubtargetInfo &STI);			bool isVI(const MCSubtargetInfo &STI);
	Show All 9 Lines

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Show First 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	if (Str.getAsInteger(0, Result)) {
LLVMContext &Ctx = F.getContext();		LLVMContext &Ctx = F.getContext();
Ctx.emitError("can't parse integer attribute " + Name);		Ctx.emitError("can't parse integer attribute " + Name);
}		}
}		}

return Result;		return Result;
}		}

// Side-by-side diff rows: the left column is the old getMaximumWorkGroupSize(),
// the right column is the new getIntegerPairAttribute(). The row starting with
// "tstellarAMD" is an inline review comment, not part of either function.
//
// New side: parses \p F's \p Name attribute as "first[,second]".
//  - Returns \p Default when the attribute is not a string attribute.
//  - The value is split at the first ','; both parts are trimmed and parsed
//    with getAsInteger(0, ...).
//  - If the first part does not parse, an error is emitted and \p Default is
//    returned.
//  - If the second part does not parse, an error is emitted and \p Default is
//    returned, unless \p OnlyFirstRequired is true and the trimmed second part
//    is empty.
unsigned getMaximumWorkGroupSize(const Function &F) {		std::pair<int, int> getIntegerPairAttribute(const Function &F,
return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);		StringRef Name,
tstellarAMDAuthorUnsubmitted Done Reply Inline Actions Mesa is using this attribute, so we still need to support it. We should also leave some tests which use it. tstellarAMD: Mesa is using this attribute, so we still need to support it. We should also leave some tests…
		std::pair<int, int> Default,
		bool OnlyFirstRequired) {
		Attribute A = F.getFnAttribute(Name);
		if (!A.isStringAttribute())
		return Default;

		LLVMContext &Ctx = F.getContext();
		// Start from Default so an unparsed component keeps its default value.
		// NOTE(review): relies on getAsInteger leaving its output untouched on
		// failure -- confirm.
		std::pair<int, int> Ints = Default;
		std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
		if (Strs.first.trim().getAsInteger(0, Ints.first)) {
		Ctx.emitError("can't parse first integer attribute " + Name);
		return Default;
		}
		if (Strs.second.trim().getAsInteger(0, Ints.second)) {
		// A missing second value is tolerated only when OnlyFirstRequired is set;
		// a present but unparseable one is always an error.
		if (!OnlyFirstRequired \|\| Strs.second.trim().size()) {
		Ctx.emitError("can't parse second integer attribute " + Name);
		return Default;
		}
		}

		return Ints;
}		}

unsigned getInitialPSInputAddr(const Function &F) {		unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);		return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}		}

bool isShader(CallingConv::ID cc) {		bool isShader(CallingConv::ID cc) {
switch(cc) {		switch(cc) {
▲ Show 20 Lines • Show All 47 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/amdgpu.private-memory.ll

	Show First 20 Lines • Show All 539 Lines • ▼ Show 20 Lines
	entry:			entry:
	%tmp = alloca [1 x i32]			%tmp = alloca [1 x i32]
	store [1 x i32] [i32 0], [1 x i32]* %tmp			store [1 x i32] [i32 0], [1 x i32]* %tmp
	%load = load [1 x i32], [1 x i32]* %tmp			%load = load [1 x i32], [1 x i32]* %tmp
	store [1 x i32] %load, [1 x i32] addrspace(1)* %out			store [1 x i32] %load, [1 x i32] addrspace(1)* %out
	ret void			ret void
	}			}

	attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }			attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }

	; HSAOPT: !0 = !{}			; HSAOPT: !0 = !{}
	; HSAOPT: !1 = !{i32 0, i32 2048}			; HSAOPT: !1 = !{i32 0, i32 2048}

	; NOHSAOPT: !0 = !{i32 0, i32 2048}			; NOHSAOPT: !0 = !{i32 0, i32 2048}

test/CodeGen/AMDGPU/array-ptr-calc-i32.ll

Show First 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
; Dummy call		; Dummy call
call void @llvm.amdgcn.s.barrier()		call void @llvm.amdgcn.s.barrier()
%reload = load i32, i32* %alloca_ptr, align 4		%reload = load i32, i32* %alloca_ptr, align 4
%out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid		%out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
store i32 %reload, i32 addrspace(1)* %out_ptr, align 4		store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
ret void		ret void
}		}

attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }		attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }
attributes #2 = { nounwind convergent }		attributes #2 = { nounwind convergent }

test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll

				; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s \| FileCheck %s

				; CHECK-LABEL: {{^}}empty_64_64:
				; CHECK: SGPRBlocks: 0
				; CHECK: VGPRBlocks: 0
				; CHECK: NumSGPRsForWavesPerEU: 1
				; CHECK: NumVGPRsForWavesPerEU: 1
				define void @empty_64_64() #0 {
				entry:
				ret void
				}
				attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}

				; CHECK-LABEL: {{^}}empty_64_128:
				; CHECK: SGPRBlocks: 0
				; CHECK: VGPRBlocks: 0
				; CHECK: NumSGPRsForWavesPerEU: 1
				; CHECK: NumVGPRsForWavesPerEU: 1
				define void @empty_64_128() #1 {
				entry:
				ret void
				}
				attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}

				; CHECK-LABEL: {{^}}empty_128_128:
				; CHECK: SGPRBlocks: 10
				; CHECK: VGPRBlocks: 7
				; CHECK: NumSGPRsForWavesPerEU: 81
				; CHECK: NumVGPRsForWavesPerEU: 29
				define void @empty_128_128() #2 {
				entry:
				ret void
				}
				attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}

				@var = addrspace(1) global float 0.0

				; CHECK-LABEL: {{^}}exactly_256_256:
				; CHECK: SGPRBlocks: 2
				; CHECK: VGPRBlocks: 5
				; CHECK: NumSGPRsForWavesPerEU: 19
				; CHECK: NumVGPRsForWavesPerEU: 24
				define void @exactly_256_256() #3 {
				%val0 = load volatile float, float addrspace(1)* @var
				%val1 = load volatile float, float addrspace(1)* @var
				%val2 = load volatile float, float addrspace(1)* @var
				%val3 = load volatile float, float addrspace(1)* @var
				%val4 = load volatile float, float addrspace(1)* @var
				%val5 = load volatile float, float addrspace(1)* @var
				%val6 = load volatile float, float addrspace(1)* @var
				%val7 = load volatile float, float addrspace(1)* @var
				%val8 = load volatile float, float addrspace(1)* @var
				%val9 = load volatile float, float addrspace(1)* @var
				%val10 = load volatile float, float addrspace(1)* @var
				%val11 = load volatile float, float addrspace(1)* @var
				%val12 = load volatile float, float addrspace(1)* @var
				%val13 = load volatile float, float addrspace(1)* @var
				%val14 = load volatile float, float addrspace(1)* @var
				%val15 = load volatile float, float addrspace(1)* @var
				%val16 = load volatile float, float addrspace(1)* @var
				%val17 = load volatile float, float addrspace(1)* @var
				%val18 = load volatile float, float addrspace(1)* @var
				%val19 = load volatile float, float addrspace(1)* @var
				%val20 = load volatile float, float addrspace(1)* @var
				%val21 = load volatile float, float addrspace(1)* @var
				%val22 = load volatile float, float addrspace(1)* @var
				%val23 = load volatile float, float addrspace(1)* @var
				%val24 = load volatile float, float addrspace(1)* @var
				%val25 = load volatile float, float addrspace(1)* @var
				%val26 = load volatile float, float addrspace(1)* @var
				%val27 = load volatile float, float addrspace(1)* @var
				%val28 = load volatile float, float addrspace(1)* @var
				%val29 = load volatile float, float addrspace(1)* @var
				%val30 = load volatile float, float addrspace(1)* @var

				store volatile float %val0, float addrspace(1)* @var
				store volatile float %val1, float addrspace(1)* @var
				store volatile float %val2, float addrspace(1)* @var
				store volatile float %val3, float addrspace(1)* @var
				store volatile float %val4, float addrspace(1)* @var
				store volatile float %val5, float addrspace(1)* @var
				store volatile float %val6, float addrspace(1)* @var
				store volatile float %val7, float addrspace(1)* @var
				store volatile float %val8, float addrspace(1)* @var
				store volatile float %val9, float addrspace(1)* @var
				store volatile float %val10, float addrspace(1)* @var
				store volatile float %val11, float addrspace(1)* @var
				store volatile float %val12, float addrspace(1)* @var
				store volatile float %val13, float addrspace(1)* @var
				store volatile float %val14, float addrspace(1)* @var
				store volatile float %val15, float addrspace(1)* @var
				store volatile float %val16, float addrspace(1)* @var
				store volatile float %val17, float addrspace(1)* @var
				store volatile float %val18, float addrspace(1)* @var
				store volatile float %val19, float addrspace(1)* @var
				store volatile float %val20, float addrspace(1)* @var
				store volatile float %val21, float addrspace(1)* @var
				store volatile float %val22, float addrspace(1)* @var
				store volatile float %val23, float addrspace(1)* @var
				store volatile float %val24, float addrspace(1)* @var
				store volatile float %val25, float addrspace(1)* @var
				store volatile float %val26, float addrspace(1)* @var
				store volatile float %val27, float addrspace(1)* @var
				store volatile float %val28, float addrspace(1)* @var
				store volatile float %val29, float addrspace(1)* @var
				store volatile float %val30, float addrspace(1)* @var

				ret void
				}
				attributes #3 = { "amdgpu-flat-work-group-size"="256,256" }

test/CodeGen/AMDGPU/attr-amdgpu-num-active-waves-per-eu.ll

				; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s \| FileCheck %s

				; Exactly 1 active wave per execution unit.
				; CHECK-LABEL: {{^}}empty_exactly_1:
				; CHECK: SGPRBlocks: 12
				; CHECK: VGPRBlocks: 32
				; CHECK: NumSGPRsForWavesPerEU: 97
				; CHECK: NumVGPRsForWavesPerEU: 129
				define void @empty_exactly_1() #0 {
				entry:
				ret void
				}
				attributes #0 = {"amdgpu-waves-per-eu"="1,1"}

				; Exactly 5 active waves per execution unit.
				; CHECK-LABEL: {{^}}empty_exactly_5:
				; CHECK: SGPRBlocks: 12
				; CHECK: VGPRBlocks: 10
				; CHECK: NumSGPRsForWavesPerEU: 97
				; CHECK: NumVGPRsForWavesPerEU: 41
				define void @empty_exactly_5() #1 {
				entry:
				ret void
				}
				attributes #1 = {"amdgpu-waves-per-eu"="5,5"}

				; Exactly 10 active waves per execution unit.
				; CHECK-LABEL: {{^}}empty_exactly_10:
				; CHECK: SGPRBlocks: 0
				; CHECK: VGPRBlocks: 0
				; CHECK: NumSGPRsForWavesPerEU: 1
				; CHECK: NumVGPRsForWavesPerEU: 1
				define void @empty_exactly_10() #2 {
				entry:
				ret void
				}
				attributes #2 = {"amdgpu-waves-per-eu"="10,10" "amdgpu-flat-work-group-size"="256,256"}

				; At least 1 active wave per execution unit.
				; CHECK-LABEL: {{^}}empty_at_least_1:
				; CHECK: SGPRBlocks: 0
				; CHECK: VGPRBlocks: 0
				; CHECK: NumSGPRsForWavesPerEU: 1
				; CHECK: NumVGPRsForWavesPerEU: 1
				define void @empty_at_least_1() #3 {
				entry:
				ret void
				}
				attributes #3 = {"amdgpu-waves-per-eu"="1"}

				; At least 5 active waves per execution unit.
				; CHECK-LABEL: {{^}}empty_at_least_5:
				; CHECK: SGPRBlocks: 0
				; CHECK: VGPRBlocks: 0
				; CHECK: NumSGPRsForWavesPerEU: 1
				; CHECK: NumVGPRsForWavesPerEU: 1
				define void @empty_at_least_5() #4 {
				entry:
				ret void
				}
				attributes #4 = {"amdgpu-waves-per-eu"="5"}

				; At least 10 active waves per execution unit.
				; CHECK-LABEL: {{^}}empty_at_least_10:
				; CHECK: SGPRBlocks: 0
				; CHECK: VGPRBlocks: 0
				; CHECK: NumSGPRsForWavesPerEU: 1
				; CHECK: NumVGPRsForWavesPerEU: 1
				define void @empty_at_least_10() #5 {
				entry:
				ret void
				}
				attributes #5 = {"amdgpu-waves-per-eu"="10" "amdgpu-flat-work-group-size"="256,256"}

				; At most 1 active wave per execution unit (same as @empty_exactly_1).

				; At most 5 active waves per execution unit.
				; CHECK-LABEL: {{^}}empty_at_most_5:
				; CHECK: SGPRBlocks: 12
				; CHECK: VGPRBlocks: 10
				; CHECK: NumSGPRsForWavesPerEU: 97
				; CHECK: NumVGPRsForWavesPerEU: 41
				define void @empty_at_most_5() #6 {
				entry:
				ret void
				}
				attributes #6 = {"amdgpu-waves-per-eu"="1,5"}

				; At most 10 active waves per execution unit.
				; CHECK-LABEL: {{^}}empty_at_most_10:
				; CHECK: SGPRBlocks: 0
				; CHECK: VGPRBlocks: 0
				; CHECK: NumSGPRsForWavesPerEU: 1
				; CHECK: NumVGPRsForWavesPerEU: 1
				define void @empty_at_most_10() #7 {
				entry:
				ret void
				}
				attributes #7 = {"amdgpu-waves-per-eu"="1,10"}

				; Between 1 and 5 active waves per execution unit (same as @empty_at_most_5).

				; Between 5 and 10 active waves per execution unit.
				; CHECK-LABEL: {{^}}empty_between_5_and_10:
				; CHECK: SGPRBlocks: 0
				; CHECK: VGPRBlocks: 0
				; CHECK: NumSGPRsForWavesPerEU: 1
				; CHECK: NumVGPRsForWavesPerEU: 1
				define void @empty_between_5_and_10() #8 {
				entry:
				ret void
				}
				attributes #8 = {"amdgpu-waves-per-eu"="5,10"}

				@var = addrspace(1) global float 0.0

				; Exactly 10 active waves per execution unit.
				; CHECK-LABEL: {{^}}exactly_10:
				; CHECK: SGPRBlocks: 2
				; CHECK: VGPRBlocks: 5
				; CHECK: NumSGPRsForWavesPerEU: 19
				; CHECK: NumVGPRsForWavesPerEU: 24
				define void @exactly_10() #9 {
				%val0 = load volatile float, float addrspace(1)* @var
				%val1 = load volatile float, float addrspace(1)* @var
				%val2 = load volatile float, float addrspace(1)* @var
				%val3 = load volatile float, float addrspace(1)* @var
				%val4 = load volatile float, float addrspace(1)* @var
				%val5 = load volatile float, float addrspace(1)* @var
				%val6 = load volatile float, float addrspace(1)* @var
				%val7 = load volatile float, float addrspace(1)* @var
				%val8 = load volatile float, float addrspace(1)* @var
				%val9 = load volatile float, float addrspace(1)* @var
				%val10 = load volatile float, float addrspace(1)* @var
				%val11 = load volatile float, float addrspace(1)* @var
				%val12 = load volatile float, float addrspace(1)* @var
				%val13 = load volatile float, float addrspace(1)* @var
				%val14 = load volatile float, float addrspace(1)* @var
				%val15 = load volatile float, float addrspace(1)* @var
				%val16 = load volatile float, float addrspace(1)* @var
				%val17 = load volatile float, float addrspace(1)* @var
				%val18 = load volatile float, float addrspace(1)* @var
				%val19 = load volatile float, float addrspace(1)* @var
				%val20 = load volatile float, float addrspace(1)* @var
				%val21 = load volatile float, float addrspace(1)* @var
				%val22 = load volatile float, float addrspace(1)* @var
				%val23 = load volatile float, float addrspace(1)* @var
				%val24 = load volatile float, float addrspace(1)* @var
				%val25 = load volatile float, float addrspace(1)* @var
				%val26 = load volatile float, float addrspace(1)* @var
				%val27 = load volatile float, float addrspace(1)* @var
				%val28 = load volatile float, float addrspace(1)* @var
				%val29 = load volatile float, float addrspace(1)* @var
				%val30 = load volatile float, float addrspace(1)* @var

				store volatile float %val0, float addrspace(1)* @var
				store volatile float %val1, float addrspace(1)* @var
				store volatile float %val2, float addrspace(1)* @var
				store volatile float %val3, float addrspace(1)* @var
				store volatile float %val4, float addrspace(1)* @var
				store volatile float %val5, float addrspace(1)* @var
				store volatile float %val6, float addrspace(1)* @var
				store volatile float %val7, float addrspace(1)* @var
				store volatile float %val8, float addrspace(1)* @var
				store volatile float %val9, float addrspace(1)* @var
				store volatile float %val10, float addrspace(1)* @var
				store volatile float %val11, float addrspace(1)* @var
				store volatile float %val12, float addrspace(1)* @var
				store volatile float %val13, float addrspace(1)* @var
				store volatile float %val14, float addrspace(1)* @var
				store volatile float %val15, float addrspace(1)* @var
				store volatile float %val16, float addrspace(1)* @var
				store volatile float %val17, float addrspace(1)* @var
				store volatile float %val18, float addrspace(1)* @var
				store volatile float %val19, float addrspace(1)* @var
				store volatile float %val20, float addrspace(1)* @var
				store volatile float %val21, float addrspace(1)* @var
				store volatile float %val22, float addrspace(1)* @var
				store volatile float %val23, float addrspace(1)* @var
				store volatile float %val24, float addrspace(1)* @var
				store volatile float %val25, float addrspace(1)* @var
				store volatile float %val26, float addrspace(1)* @var
				store volatile float %val27, float addrspace(1)* @var
				store volatile float %val28, float addrspace(1)* @var
				store volatile float %val29, float addrspace(1)* @var
				store volatile float %val30, float addrspace(1)* @var

				ret void
				}
				attributes #9 = { "amdgpu-waves-per-eu"="10,10" "amdgpu-flat-work-group-size"="256,256" }

test/CodeGen/AMDGPU/attr-amdgpu-num-gpr.ll

				; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s \| FileCheck %s

				; CHECK-LABEL: {{^}}num_sgpr:
				; CHECK: SGPRBlocks: 1
				; CHECK: NumSGPRsForWavesPerEU: 13
				define void @num_sgpr(i32 addrspace(1)* %out1,
				i32 addrspace(1)* %out2,
				i32 addrspace(1)* %out3,
				i32 addrspace(1)* %out4,
				i32 %one, i32 %two, i32 %three, i32 %four) #0 {
				store i32 %one, i32 addrspace(1)* %out1
				store i32 %two, i32 addrspace(1)* %out2
				store i32 %three, i32 addrspace(1)* %out3
				store i32 %four, i32 addrspace(1)* %out4
				ret void
				}
				attributes #0 = { "amdgpu-num-sgpr"="18" }

				@var = addrspace(1) global float 0.0

				; CHECK-LABEL: {{^}}num_vgpr:
				; CHECK: VGPRBlocks: 4
				; CHECK: NumVGPRsForWavesPerEU: 20
				define void @num_vgpr() #1 {
				%val0 = load volatile float, float addrspace(1)* @var
				%val1 = load volatile float, float addrspace(1)* @var
				%val2 = load volatile float, float addrspace(1)* @var
				%val3 = load volatile float, float addrspace(1)* @var
				%val4 = load volatile float, float addrspace(1)* @var
				%val5 = load volatile float, float addrspace(1)* @var
				%val6 = load volatile float, float addrspace(1)* @var
				%val7 = load volatile float, float addrspace(1)* @var
				%val8 = load volatile float, float addrspace(1)* @var
				%val9 = load volatile float, float addrspace(1)* @var
				%val10 = load volatile float, float addrspace(1)* @var
				%val11 = load volatile float, float addrspace(1)* @var
				%val12 = load volatile float, float addrspace(1)* @var
				%val13 = load volatile float, float addrspace(1)* @var
				%val14 = load volatile float, float addrspace(1)* @var
				%val15 = load volatile float, float addrspace(1)* @var
				%val16 = load volatile float, float addrspace(1)* @var
				%val17 = load volatile float, float addrspace(1)* @var
				%val18 = load volatile float, float addrspace(1)* @var
				%val19 = load volatile float, float addrspace(1)* @var
				%val20 = load volatile float, float addrspace(1)* @var
				%val21 = load volatile float, float addrspace(1)* @var
				%val22 = load volatile float, float addrspace(1)* @var
				%val23 = load volatile float, float addrspace(1)* @var
				%val24 = load volatile float, float addrspace(1)* @var
				%val25 = load volatile float, float addrspace(1)* @var
				%val26 = load volatile float, float addrspace(1)* @var
				%val27 = load volatile float, float addrspace(1)* @var
				%val28 = load volatile float, float addrspace(1)* @var
				%val29 = load volatile float, float addrspace(1)* @var
				%val30 = load volatile float, float addrspace(1)* @var

				store volatile float %val0, float addrspace(1)* @var
				store volatile float %val1, float addrspace(1)* @var
				store volatile float %val2, float addrspace(1)* @var
				store volatile float %val3, float addrspace(1)* @var
				store volatile float %val4, float addrspace(1)* @var
				store volatile float %val5, float addrspace(1)* @var
				store volatile float %val6, float addrspace(1)* @var
				store volatile float %val7, float addrspace(1)* @var
				store volatile float %val8, float addrspace(1)* @var
				store volatile float %val9, float addrspace(1)* @var
				store volatile float %val10, float addrspace(1)* @var
				store volatile float %val11, float addrspace(1)* @var
				store volatile float %val12, float addrspace(1)* @var
				store volatile float %val13, float addrspace(1)* @var
				store volatile float %val14, float addrspace(1)* @var
				store volatile float %val15, float addrspace(1)* @var
				store volatile float %val16, float addrspace(1)* @var
				store volatile float %val17, float addrspace(1)* @var
				store volatile float %val18, float addrspace(1)* @var
				store volatile float %val19, float addrspace(1)* @var
				store volatile float %val20, float addrspace(1)* @var
				store volatile float %val21, float addrspace(1)* @var
				store volatile float %val22, float addrspace(1)* @var
				store volatile float %val23, float addrspace(1)* @var
				store volatile float %val24, float addrspace(1)* @var
				store volatile float %val25, float addrspace(1)* @var
				store volatile float %val26, float addrspace(1)* @var
				store volatile float %val27, float addrspace(1)* @var
				store volatile float %val28, float addrspace(1)* @var
				store volatile float %val29, float addrspace(1)* @var
				store volatile float %val30, float addrspace(1)* @var

				ret void
				}
				attributes #1 = { "amdgpu-num-vgpr"="20" }

test/CodeGen/AMDGPU/attr-unparseable.ll

				; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 \| FileCheck %s

				; CHECK: can't parse integer attribute amdgpu-num-sgpr
				define void @unparseable_single_0() #0 {
				entry:
				ret void
				}
				attributes #0 = { "amdgpu-num-sgpr" }

				; CHECK: can't parse integer attribute amdgpu-num-sgpr
				define void @unparseable_single_1() #1 {
				entry:
				ret void
				}
				attributes #1 = { "amdgpu-num-sgpr"="k" }

				; CHECK: can't parse integer attribute amdgpu-num-sgpr
				define void @unparseable_single_2() #2 {
				entry:
				ret void
				}
				attributes #2 = { "amdgpu-num-sgpr"="1,2" }

				; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
				define void @unparseable_pair_0() #3 {
				entry:
				ret void
				}
				attributes #3 = { "amdgpu-flat-work-group-size" }

				; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
				define void @unparseable_pair_1() #4 {
				entry:
				ret void
				}
				attributes #4 = { "amdgpu-flat-work-group-size"="k" }

				; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
				define void @unparseable_pair_2() #5 {
				entry:
				ret void
				}
				attributes #5 = { "amdgpu-flat-work-group-size"="1" }

				; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
				define void @unparseable_pair_3() #6 {
				entry:
				ret void
				}
				attributes #6 = { "amdgpu-flat-work-group-size"="1,k" }

				; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
				define void @unparseable_pair_4() #7 {
				entry:
				ret void
				}
				attributes #7 = { "amdgpu-flat-work-group-size"="1,2,3" }

test/CodeGen/AMDGPU/indirect-private-64.ll

Show First 20 Lines • Show All 115 Lines • ▼ Show 20 Lines	define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
store <2 x i64> %val, <2 x i64>* %ptr, align 16		store <2 x i64> %val, <2 x i64>* %ptr, align 16
call void @llvm.amdgcn.s.barrier()		call void @llvm.amdgcn.s.barrier()
%result = load <2 x i64>, <2 x i64>* %ptr, align 16		%result = load <2 x i64>, <2 x i64>* %ptr, align 16
store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16		store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
ret void		ret void
}		}

attributes #0 = { convergent nounwind }		attributes #0 = { convergent nounwind }
attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="2" "amdgpu-max-work-group-size"="64" }		attributes #1 = { nounwind "amdgpu-waves-per-eu"="2,2" "amdgpu-flat-work-group-size"="64,64" }

test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

Show First 20 Lines • Show All 249 Lines • ▼ Show 20 Lines	entry:
%arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1		%arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
%tmp3 = load i8, i8* %arrayidx12, align 1		%tmp3 = load i8, i8* %arrayidx12, align 1
%arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1		%arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1		store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
ret void		ret void
}		}

attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }		attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="3" "amdgpu-max-work-group-size"="256" }		attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
attributes #2 = { nounwind "amdgpu-max-waves-per-eu"="1" "amdgpu-max-work-group-size"="1600" }		attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
attributes #3 = { nounwind "amdgpu-max-waves-per-eu"="0" }		attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #4 = { nounwind "amdgpu-max-waves-per-eu"="-1" }		attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #5 = { nounwind "amdgpu-max-waves-per-eu"="6" "amdgpu-max-work-group-size"="64" }		attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
attributes #6 = { nounwind "amdgpu-max-waves-per-eu"="8" "amdgpu-max-work-group-size"="64" }		attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
attributes #7 = { nounwind "amdgpu-max-waves-per-eu"="9" "amdgpu-max-work-group-size"="64" }		attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }

test/CodeGen/AMDGPU/large-work-group-registers.ll

	; RUN: llc -march=amdgcn -mcpu=tonga -regalloc=basic -post-RA-scheduler=0 < %s \| FileCheck %s			; RUN: llc -march=amdgcn -mcpu=tonga -regalloc=basic -post-RA-scheduler=0 < %s \| FileCheck %s

	; CHECK: NumVgprs: 64			; CHECK: NumVgprs: 32
	define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {			define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
	main_body:			main_body:
	%8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8			%8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
	%9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0			%9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
	%10 = extractelement <3 x i32> %7, i32 0			%10 = extractelement <3 x i32> %7, i32 0
	%11 = extractelement <3 x i32> %7, i32 1			%11 = extractelement <3 x i32> %7, i32 1
	%12 = mul i32 %10, %11			%12 = mul i32 %10, %11
	%bc = bitcast <3 x i32> %7 to <3 x float>			%bc = bitcast <3 x i32> %7 to <3 x float>
	Show All 30 Lines

test/CodeGen/AMDGPU/load-constant-i16.ll

Show First 20 Lines • Show All 254 Lines • ▼ Show 20 Lines	define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
%ext = sext <16 x i16> %load to <16 x i32>		%ext = sext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out		store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:		; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
; GCN-DAG: s_load_dwordx16		; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}		; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]		; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16		; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {		define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(2)* %in		%load = load <32 x i16>, <32 x i16> addrspace(2)* %in
%ext = zext <32 x i16> %load to <32 x i32>		%ext = zext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out		store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32:		; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32:
▲ Show 20 Lines • Show All 169 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/load-local-i16.ll

Show First 20 Lines • Show All 261 Lines • ▼ Show 20 Lines	define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(3)* %in		%load = load <16 x i16>, <16 x i16> addrspace(3)* %in
%ext = sext <16 x i16> %load to <16 x i32>		%ext = sext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(3)* %out		store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:		; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:6
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:3
define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {		define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #1 {
%load = load <32 x i16>, <32 x i16> addrspace(3)* %in		%load = load <32 x i16>, <32 x i16> addrspace(3)* %in
%ext = zext <32 x i16> %load to <32 x i32>		%ext = zext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(3)* %out		store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:		; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5{{$}}		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {		define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #1 {
%load = load <32 x i16>, <32 x i16> addrspace(3)* %in		%load = load <32 x i16>, <32 x i16> addrspace(3)* %in
%ext = sext <32 x i16> %load to <32 x i32>		%ext = sext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(3)* %out		store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
ret void		ret void
}		}

; FIXME: Missed read2
; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:		; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:15 offset1:12
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13		; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:13 offset1:14
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112		define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #1 {
define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(3)* %in		%load = load <64 x i16>, <64 x i16> addrspace(3)* %in
%ext = zext <64 x i16> %load to <64 x i32>		%ext = zext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(3)* %out		store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:		; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {		define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
▲ Show 20 Lines • Show All 135 Lines • ▼ Show 20 Lines
; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {		; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in		; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
; %ext = sext <64 x i16> %load to <64 x i64>		; %ext = sext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out		; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
; ret void		; ret void
; }		; }

attributes #0 = { nounwind }		attributes #0 = { nounwind }
		attributes #1 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="2" }

test/CodeGen/AMDGPU/private-memory-r600.ll

Show First 20 Lines • Show All 291 Lines • ▼ Show 20 Lines	define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%tmp4 = getelementptr inbounds i32, i32* %tmp3, i32 %b		%tmp4 = getelementptr inbounds i32, i32* %tmp3, i32 %b
%tmp5 = load i32, i32* %tmp4		%tmp5 = load i32, i32* %tmp4
store i32 %tmp5, i32 addrspace(1)* %out		store i32 %tmp5, i32 addrspace(1)* %out
ret void		ret void
}		}

; OPT: !0 = !{i32 0, i32 2048}		; OPT: !0 = !{i32 0, i32 2048}

attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }		attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }

test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll

	Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines
	define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {			define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
	%alloca = alloca [17 x i32], align 4			%alloca = alloca [17 x i32], align 4
	%alloca.bc = bitcast [17 x i32]* %alloca to i8*			%alloca.bc = bitcast [17 x i32]* %alloca to i8*
	%size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false)			%size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false)
	store i32 %size, i32 addrspace(1)* %out			store i32 %size, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" "amdgpu-max-waves-per-eu"="3" }			attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="3,3" }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/promote-alloca-no-opts.ll

Show All 28 Lines	entry:
store i32 0, i32* %gep0		store i32 0, i32* %gep0
store i32 1, i32* %gep1		store i32 1, i32* %gep1
%gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index		%gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
%load = load i32, i32* %gep2		%load = load i32, i32* %gep2
store i32 %load, i32 addrspace(1)* %out		store i32 %load, i32 addrspace(1)* %out
ret void		ret void
}		}

attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }		attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" }
attributes #1 = { nounwind optnone noinline "amdgpu-max-work-group-size"="64" }		attributes #1 = { nounwind optnone noinline "amdgpu-flat-work-group-size"="64,64" }

test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll

Show First 20 Lines • Show All 121 Lines • ▼ Show 20 Lines	entry:
store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4		store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4

%gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx		%gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16		store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16

ret void		ret void
}		}

attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }		attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,7" }

test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp eq i32* %ptr0, %ptr1		%cmp = icmp eq i32* %ptr0, %ptr1
%zext = zext i1 %cmp to i32		%zext = zext i1 %cmp to i32
store volatile i32 %zext, i32 addrspace(1)* %out		store volatile i32 %zext, i32 addrspace(1)* %out
ret void		ret void
}		}

declare i32* @get_unknown_pointer() #0		declare i32* @get_unknown_pointer() #0

attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }		attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }

test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll

Show First 20 Lines • Show All 195 Lines • ▼ Show 20 Lines	for.body: ; preds = %for.body, %for.body.preheader
%incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1		%incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
%inc = add nuw nsw i32 %i.09, 1		%inc = add nuw nsw i32 %i.09, 1
%cmp = icmp eq i32* %incdec.ptr, %call		%cmp = icmp eq i32* %incdec.ptr, %call
br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body		br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
}		}

declare i32* @get_unknown_pointer() #0		declare i32* @get_unknown_pointer() #0

attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }		attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }

test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll

Show First 20 Lines • Show All 123 Lines • ▼ Show 20 Lines	bb:
%tmp2 = icmp eq i32 %arg1, 0		%tmp2 = icmp eq i32 %arg1, 0
%tmp3 = select i1 %tmp2, double* null, double* %tmp		%tmp3 = select i1 %tmp2, double* null, double* %tmp
store double 1.000000e+00, double* %tmp3, align 8		store double 1.000000e+00, double* %tmp3, align 8
%tmp4 = load double, double* %tmp, align 8		%tmp4 = load double, double* %tmp, align 8
store double %tmp4, double addrspace(1)* %arg		store double %tmp4, double addrspace(1)* %arg
ret void		ret void
}		}

attributes #0 = { norecurse nounwind "amdgpu-max-waves-per-eu"="1" }		attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" }
attributes #1 = { norecurse nounwind }		attributes #1 = { norecurse nounwind }
No newline at end of file		No newline at end of file

test/CodeGen/AMDGPU/target-cpu.ll

Show First 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	entry:
ret void		ret void
}		}

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "target-cpu"="tahiti" }		attributes #2 = { nounwind "target-cpu"="tahiti" }
attributes #3 = { nounwind "target-cpu"="bonaire" }		attributes #3 = { nounwind "target-cpu"="bonaire" }
attributes #4 = { nounwind "target-cpu"="fiji" }		attributes #4 = { nounwind "target-cpu"="fiji" }
attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-max-waves-per-eu"="3" }		attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" }
attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-max-waves-per-eu"="3" }		attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Wave and register controls
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 67801

lib/Target/AMDGPU/AMDGPUAsmPrinter.h

lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

lib/Target/AMDGPU/AMDGPUSubtarget.h

lib/Target/AMDGPU/AMDGPUSubtarget.cpp

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIMachineFunctionInfo.h

lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

lib/Target/AMDGPU/SIRegisterInfo.h

lib/Target/AMDGPU/SIRegisterInfo.cpp

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

test/CodeGen/AMDGPU/amdgpu.private-memory.ll

test/CodeGen/AMDGPU/array-ptr-calc-i32.ll

test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll

test/CodeGen/AMDGPU/attr-amdgpu-num-active-waves-per-eu.ll

test/CodeGen/AMDGPU/attr-amdgpu-num-gpr.ll

test/CodeGen/AMDGPU/attr-unparseable.ll

test/CodeGen/AMDGPU/indirect-private-64.ll

test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

test/CodeGen/AMDGPU/large-work-group-registers.ll

test/CodeGen/AMDGPU/load-constant-i16.ll

test/CodeGen/AMDGPU/load-local-i16.ll

test/CodeGen/AMDGPU/private-memory-r600.ll

test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll

test/CodeGen/AMDGPU/promote-alloca-no-opts.ll

test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll

test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll

test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll

test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll

test/CodeGen/AMDGPU/target-cpu.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Wave and register controls
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 67801

lib/Target/AMDGPU/AMDGPUAsmPrinter.h

lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

lib/Target/AMDGPU/AMDGPUSubtarget.h

lib/Target/AMDGPU/AMDGPUSubtarget.cpp

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIMachineFunctionInfo.h

lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

lib/Target/AMDGPU/SIRegisterInfo.h

lib/Target/AMDGPU/SIRegisterInfo.cpp

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

test/CodeGen/AMDGPU/amdgpu.private-memory.ll

test/CodeGen/AMDGPU/array-ptr-calc-i32.ll

test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll

test/CodeGen/AMDGPU/attr-amdgpu-num-active-waves-per-eu.ll

test/CodeGen/AMDGPU/attr-amdgpu-num-gpr.ll

test/CodeGen/AMDGPU/attr-unparseable.ll

test/CodeGen/AMDGPU/indirect-private-64.ll

test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

test/CodeGen/AMDGPU/large-work-group-registers.ll

test/CodeGen/AMDGPU/load-constant-i16.ll

test/CodeGen/AMDGPU/load-local-i16.ll

test/CodeGen/AMDGPU/private-memory-r600.ll

test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll

test/CodeGen/AMDGPU/promote-alloca-no-opts.ll

test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll

test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll

test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll

test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll

test/CodeGen/AMDGPU/target-cpu.ll

[AMDGPU] Wave and register controls
ClosedPublic