Diff 463538

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Show First 20 Lines • Show All 492 Lines • ▼ Show 20 Lines	bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
MachineFunction &MF = B.getMF();		MachineFunction &MF = B.getMF();
const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();		const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();		MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();		SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();		const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();		const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();		const DataLayout &DL = F.getParent()->getDataLayout();

Info->allocateModuleLDSGlobal(F);		Info->allocateKnownAddressLDSGlobal(F);

SmallVector<CCValAssign, 16> ArgLocs;		SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());		CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

allocateHSAUserSGPRs(CCInfo, B, MF, TRI, Info);		allocateHSAUserSGPRs(CCInfo, B, MF, TRI, Info);

unsigned i = 0;		unsigned i = 0;
const Align KernArgBaseAlign(16);		const Align KernArgBaseAlign(16);
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines	bool AMDGPUCallLowering::lowerFormalArguments(
MachineFunction &MF = B.getMF();		MachineFunction &MF = B.getMF();
MachineBasicBlock &MBB = B.getMBB();		MachineBasicBlock &MBB = B.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();		MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();		SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();		const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();		const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();		const DataLayout &DL = F.getParent()->getDataLayout();

Info->allocateModuleLDSGlobal(F);		Info->allocateKnownAddressLDSGlobal(F);

SmallVector<CCValAssign, 16> ArgLocs;		SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());		CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

if (Info->hasImplicitBufferPtr()) {		if (Info->hasImplicitBufferPtr()) {
Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);		Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);		MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);		CCInfo.AllocateReg(ImplicitBufferPtrReg);
▲ Show 20 Lines • Show All 832 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Show First 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	public:
bool isMemoryBound() const {		bool isMemoryBound() const {
return MemoryBound;		return MemoryBound;
}		}

bool needsWaveLimiter() const {		bool needsWaveLimiter() const {
return WaveLimiter;		return WaveLimiter;
}		}

unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);		unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
void allocateModuleLDSGlobal(const Function &F);		return allocateLDSGlobal(DL, GV, DynLDSAlign);
		}
		unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
		Align Trailing);

		void allocateKnownAddressLDSGlobal(const Function &F);

		// A kernel function may have an associated LDS allocation, and a kernel-scope
		// LDS allocation must have an associated kernel function
		static const GlobalVariable *
		getKernelLDSGlobalFromFunction(const Function &F);

static Optional<uint32_t> getLDSKernelIdMetadata(const Function &F);		static Optional<uint32_t> getLDSKernelIdMetadata(const Function &F);

Align getDynLDSAlign() const { return DynLDSAlign; }		Align getDynLDSAlign() const { return DynLDSAlign; }

void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);		void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
};		};

}		}
#endif		#endif

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
StaticGDSSize = GDSSize;		StaticGDSSize = GDSSize;

CallingConv::ID CC = F.getCallingConv();		CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL \|\| CC == CallingConv::SPIR_KERNEL)		if (CC == CallingConv::AMDGPU_KERNEL \|\| CC == CallingConv::SPIR_KERNEL)
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);		ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
}		}

unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,		unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
const GlobalVariable &GV) {		const GlobalVariable &GV,
		Align Trailing) {
auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));		auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
if (!Entry.second)		if (!Entry.second)
return Entry.first->second;		return Entry.first->second;

Align Alignment =		Align Alignment =
DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());		DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());

unsigned Offset;		unsigned Offset;
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {		if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
/// TODO: We should sort these to minimize wasted space due to alignment		/// TODO: We should sort these to minimize wasted space due to alignment
/// padding. Currently the padding is decided by the first encountered use		/// padding. Currently the padding is decided by the first encountered use
/// during lowering.		/// during lowering.
Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);		Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);

StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());		StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());

// Update the LDS size considering the padding to align the dynamic shared		// Align LDS size to trailing, e.g. for aligning dynamic shared memory
// memory.		LDSSize = alignTo(StaticLDSSize, Trailing);
LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
} else {		} else {
assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS &&		assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS &&
"expected region address space");		"expected region address space");

Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment);		Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment);
StaticGDSSize += DL.getTypeAllocSize(GV.getValueType());		StaticGDSSize += DL.getTypeAllocSize(GV.getValueType());

// FIXME: Apply alignment of dynamic GDS		// FIXME: Apply alignment of dynamic GDS
GDSSize = StaticGDSSize;		GDSSize = StaticGDSSize;
}		}

Entry.first->second = Offset;		Entry.first->second = Offset;
return Offset;		return Offset;
}		}

		const GlobalVariable *
		AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) {
		const Module *M = F.getParent();
		std::string KernelLDSName = "llvm.amdgcn.kernel.";
		KernelLDSName += F.getName();
		arsenm [inline comment, Not Done]: Need to be careful about multiple anonymous functions.
		JonChesterfield (author) [inline comment, Done]: How so? By the time they're functions in IR they have unique names, as far as I know. If they don't, the current lowering is already broken for them, as it assumes a struct can be uniquely associated with a function by deriving the variable name from the function name.
		arsenm [inline comment, Not Done]: They don't have to have a name at all. If you have kernels "@0" and "@1" they'll get the same thing here.
		JonChesterfield (author) [inline comment, Done]: That's exciting. This is broken at present: AMDGPULowerModuleLDSPass creates the kernel-specific variable by passing `Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds"` to StructType::Create() with a suffix ".t". I'd be inclined to say the right fix for that is to hard error on them, on the basis that anonymous kernels are difficult to invoke by name, but failing that we could assign them names in LowerModuleLDS. Or we could stop using string equivalence to bind a function and a struct together and do something along the lines of metadata instead, which should be more robust to things like renaming the kernel.
		arsenm [inline comment, Not Done]: You could do what the final emission does, and use the name out of Mangler::getNameWithPrefix.
		JonChesterfield (author) [inline comment, Done]: Can I postpone fixing that until after this patch, given it's already broken independent of this patch?
		JonChesterfield (author) [inline comment, Done]: Mangler::getNameWithPrefix is a local state thing - different instances of Mangler will assign different values. We could Value::setName the anonymous function, but that'll be a change visible outside of LDS and feels inherently wrong.
		KernelLDSName += ".lds";
		return M->getNamedGlobal(KernelLDSName);
		}

// This kernel calls no functions that require the module lds struct		// This kernel calls no functions that require the module lds struct
static bool canElideModuleLDS(const Function &F) {		static bool canElideModuleLDS(const Function &F) {
return F.hasFnAttribute("amdgpu-elide-module-lds");		return F.hasFnAttribute("amdgpu-elide-module-lds");
}		}

void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {		void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
const Module *M = F.getParent();		const Module *M = F.getParent();

		// This function is called before allocating any other LDS so that it can
		// reliably put values at known addresses. Consequently, dynamic LDS, if
		// present, will not yet have been allocated

		assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated");

if (isModuleEntryFunction()) {		if (isModuleEntryFunction()) {

		// Pointer values start from zero, memory allocated per-kernel-launch
		// Variables can be grouped into a module level struct and a struct per
		// kernel function by AMDGPULowerModuleLDSPass. If that is done, they
		// are allocated at statically computable addresses here.
		//
		// Address 0
		// {
		// llvm.amdgcn.module.lds
		// }
		// alignment padding
		// {
		// llvm.amdgcn.kernel.some-name.lds
		// }
		// other variables, e.g. dynamic lds, allocated after this call

const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");		const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
		const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);

if (GV && !canElideModuleLDS(F)) {		if (GV && !canElideModuleLDS(F)) {
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);		unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
		JonChesterfield (author) [inline comment, Done]: This is a negligible optimisation, and it also decouples the layout from whether dynamic LDS alignment is specified, which is useful for calculating where the variable is going to be from a context that doesn't know whether dynamic LDS is in play.
(void)Offset;		(void)Offset;
assert(Offset == 0 &&		assert(Offset == 0 &&
"Module LDS expected to be allocated before other LDS");		"Module LDS expected to be allocated before other LDS");
}		}

		if (KV) {
		// The per-kernel offset is deterministic because it is allocated
		// before any other non-module LDS variables.
		unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align());
		(void)Offset;
		}
}		}
}		}

Optional<uint32_t>		Optional<uint32_t>
AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {		AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
auto MD = F.getMetadata("llvm.amdgcn.lds.kernel.id");		auto MD = F.getMetadata("llvm.amdgcn.lds.kernel.id");
if (MD && MD->getNumOperands() == 1) {		if (MD && MD->getNumOperands() == 1) {
ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(0));		ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(0));
Show All 22 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,363 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerFormalArguments(

if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {		if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
DiagnosticInfoUnsupported NoGraphicsHSA(		DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());		Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);		DAG.getContext()->diagnose(NoGraphicsHSA);
return DAG.getEntryNode();		return DAG.getEntryNode();
}		}

Info->allocateModuleLDSGlobal(Fn);		Info->allocateKnownAddressLDSGlobal(Fn);

SmallVector<ISD::InputArg, 16> Splits;		SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;		SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());		BitVector Skipped(Ins.size());
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,		CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());		*DAG.getContext());

bool IsGraphics = AMDGPU::isGraphics(CallConv);		bool IsGraphics = AMDGPU::isGraphics(CallConv);
▲ Show 20 Lines • Show All 10,616 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s \| FileCheck %s

				arsenm [inline comment, Not Done]: Should use -mtriple=amdgcn-amd-amdhsa; the checks are using the old-style mesa relocations for stack.
				JonChesterfield (author) [inline comment, Done]: Nice, thanks. Dropping to gfx10 only as well, which makes the test easier to read.
				; LDS is allocated per-kernel. Module scope variables are gathered into a struct which is
				; allocated at address zero, if used by the kernel. Kernel scope variables are gathered into
				; a per-kernel struct and allocated immediately after the module scope.
				arsenm [inline comment, Not Done]: Is there a real reason to test this with all the targets?
				; This test checks that the module and kernel scope variables are allocated in deterministic
				; order without spurious alignment padding between the two

				; External LDS is checked because it influences LDS padding in general and because it will
				; not be moved into either module or kernel struct

				@module_variable = addrspace(3) global i16 undef

				; Variables are allocated into module scope block when used by a non-kernel function
				define void @use_module() #0 {
				; CHECK-LABEL: use_module:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: ds_write_b16 v0, v0
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_setpc_b64 s[30:31]
				store i16 0, i16 addrspace(3)* @module_variable
				ret void
				}

				; Variables only used by kernels are specialised and allocated per-kernel
				@kernel_normal = addrspace(3) global i16 undef
				@kernel_overalign = addrspace(3) global i16 undef, align 4

				; External LDS shall not introduce padding between module and kernel scope variables
				@extern_normal = external addrspace(3) global [0 x float]
				@extern_overalign = external addrspace(3) global [0 x float], align 8

				; 2^3 cases encoded into function names

				define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 {
				; CHECK-LABEL: module_0_kernel_normal_extern_normal:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: v_mov_b32_e32 v1, 2
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_lshl_b32 s0, s0, 2
				; CHECK-NEXT: s_add_i32 s0, s0, 4
				; CHECK-NEXT: v_mov_b32_e32 v2, s0
				; CHECK-NEXT: ds_write_b16 v0, v1
				; CHECK-NEXT: ds_write_b32 v2, v0
				; CHECK-NEXT: s_endpgm
				store i16 2, i16 addrspace(3)* @kernel_normal

				%arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
				store float 0.0, float addrspace(3)* %arrayidx1
				ret void
				}

				define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
				; CHECK-LABEL: module_1_kernel_normal_extern_normal:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_add_u32 s8, s8, s11
				; CHECK-NEXT: s_mov_b32 s32, 0
				; CHECK-NEXT: s_addc_u32 s9, s9, 0
				; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
				; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
				; CHECK-NEXT: s_add_u32 s0, s0, s11
				; CHECK-NEXT: s_addc_u32 s1, s1, 0
				; CHECK-NEXT: s_getpc_b64 s[8:9]
				; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
				; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
				; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
				; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
				; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
				; CHECK-NEXT: s_lshl_b32 s4, s12, 2
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: v_mov_b32_e32 v1, 1
				; CHECK-NEXT: s_add_i32 s4, s4, 4
				; CHECK-NEXT: v_mov_b32_e32 v2, 2
				; CHECK-NEXT: v_mov_b32_e32 v3, s4
				; CHECK-NEXT: ds_write_b16 v0, v1
				; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
				; CHECK-NEXT: ds_write_b32 v3, v0
				; CHECK-NEXT: s_endpgm
				call void @use_module()
				store i16 1, i16 addrspace(3)* @module_variable

				store i16 2, i16 addrspace(3)* @kernel_normal

				%arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
				store float 0.0, float addrspace(3)* %arrayidx1
				ret void
				}

				define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1 {
				; CHECK-LABEL: module_0_kernel_overalign_extern_normal:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: v_mov_b32_e32 v1, 2
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_lshl_b32 s0, s0, 2
				; CHECK-NEXT: s_add_i32 s0, s0, 4
				; CHECK-NEXT: v_mov_b32_e32 v2, s0
				; CHECK-NEXT: ds_write_b16 v0, v1
				; CHECK-NEXT: ds_write_b32 v2, v0
				; CHECK-NEXT: s_endpgm
				store i16 2, i16 addrspace(3)* @kernel_overalign

				%arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
				store float 0.0, float addrspace(3)* %arrayidx1
				ret void
				}

				define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
				; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_add_u32 s8, s8, s11
				; CHECK-NEXT: s_mov_b32 s32, 0
				; CHECK-NEXT: s_addc_u32 s9, s9, 0
				; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
				; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
				; CHECK-NEXT: s_add_u32 s0, s0, s11
				; CHECK-NEXT: s_addc_u32 s1, s1, 0
				; CHECK-NEXT: s_getpc_b64 s[8:9]
				; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
				; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
				; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
				; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
				; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
				; CHECK-NEXT: s_lshl_b32 s4, s12, 2
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: v_mov_b32_e32 v1, 1
				; CHECK-NEXT: s_add_i32 s4, s4, 8
				; CHECK-NEXT: v_mov_b32_e32 v2, 2
				; CHECK-NEXT: v_mov_b32_e32 v3, s4
				; CHECK-NEXT: ds_write_b16 v0, v1
				; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
				; CHECK-NEXT: ds_write_b32 v3, v0
				; CHECK-NEXT: s_endpgm
				call void @use_module()
				store i16 1, i16 addrspace(3)* @module_variable

				store i16 2, i16 addrspace(3)* @kernel_overalign

				%arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
				store float 0.0, float addrspace(3)* %arrayidx1
				ret void
				}

				define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1 {
				; CHECK-LABEL: module_0_kernel_normal_extern_overalign:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: v_mov_b32_e32 v1, 2
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_lshl_b32 s0, s0, 2
				; CHECK-NEXT: s_add_i32 s0, s0, 8
				; CHECK-NEXT: v_mov_b32_e32 v2, s0
				; CHECK-NEXT: ds_write_b16 v0, v1
				; CHECK-NEXT: ds_write_b32 v2, v0
				; CHECK-NEXT: s_endpgm
				store i16 2, i16 addrspace(3)* @kernel_normal

				%arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
				store float 0.0, float addrspace(3)* %arrayidx1
				ret void
				}

				define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
				; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_add_u32 s8, s8, s11
				; CHECK-NEXT: s_mov_b32 s32, 0
				; CHECK-NEXT: s_addc_u32 s9, s9, 0
				; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
				; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
				; CHECK-NEXT: s_add_u32 s0, s0, s11
				; CHECK-NEXT: s_addc_u32 s1, s1, 0
				; CHECK-NEXT: s_getpc_b64 s[8:9]
				; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
				; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
				; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
				; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
				; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
				; CHECK-NEXT: s_lshl_b32 s4, s12, 2
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: v_mov_b32_e32 v1, 1
				; CHECK-NEXT: s_add_i32 s4, s4, 8
				; CHECK-NEXT: v_mov_b32_e32 v2, 2
				; CHECK-NEXT: v_mov_b32_e32 v3, s4
				; CHECK-NEXT: ds_write_b16 v0, v1
				; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
				; CHECK-NEXT: ds_write_b32 v3, v0
				; CHECK-NEXT: s_endpgm
				call void @use_module()
				store i16 1, i16 addrspace(3)* @module_variable

				store i16 2, i16 addrspace(3)* @kernel_normal

				%arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
				store float 0.0, float addrspace(3)* %arrayidx1
				ret void
				}

				define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) #1 {
				; CHECK-LABEL: module_0_kernel_overalign_extern_overalign:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: v_mov_b32_e32 v1, 2
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_lshl_b32 s0, s0, 2
				; CHECK-NEXT: s_add_i32 s0, s0, 8
				; CHECK-NEXT: v_mov_b32_e32 v2, s0
				; CHECK-NEXT: ds_write_b16 v0, v1
				; CHECK-NEXT: ds_write_b32 v2, v0
				; CHECK-NEXT: s_endpgm
				store i16 2, i16 addrspace(3)* @kernel_overalign

				%arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
				store float 0.0, float addrspace(3)* %arrayidx1
				ret void
				}

				define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
				; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
				; CHECK: ; %bb.0:
				; CHECK-NEXT: s_add_u32 s8, s8, s11
				; CHECK-NEXT: s_mov_b32 s32, 0
				; CHECK-NEXT: s_addc_u32 s9, s9, 0
				; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
				; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
				; CHECK-NEXT: s_add_u32 s0, s0, s11
				; CHECK-NEXT: s_addc_u32 s1, s1, 0
				; CHECK-NEXT: s_getpc_b64 s[8:9]
				; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
				; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
				; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
				; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
				; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
				; CHECK-NEXT: s_waitcnt lgkmcnt(0)
				; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
				; CHECK-NEXT: s_lshl_b32 s4, s12, 2
				; CHECK-NEXT: v_mov_b32_e32 v0, 0
				; CHECK-NEXT: v_mov_b32_e32 v1, 1
				; CHECK-NEXT: s_add_i32 s4, s4, 8
				; CHECK-NEXT: v_mov_b32_e32 v2, 2
				; CHECK-NEXT: v_mov_b32_e32 v3, s4
				; CHECK-NEXT: ds_write_b16 v0, v1
				; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
				; CHECK-NEXT: ds_write_b32 v3, v0
				; CHECK-NEXT: s_endpgm
				call void @use_module()
				store i16 1, i16 addrspace(3)* @module_variable

				store i16 2, i16 addrspace(3)* @kernel_overalign

				%arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
				store float 0.0, float addrspace(3)* %arrayidx1
				ret void
				}

				attributes #0 = { noinline }
				attributes #1 = { "amdgpu-elide-module-lds" }

This is an archive of the discontinued LLVM Phabricator instance.

[amdgpu][nfc] Allocate kernel-specific LDS struct deterministically
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 463538

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll

This is an archive of the discontinued LLVM Phabricator instance.

[amdgpu][nfc] Allocate kernel-specific LDS struct deterministically
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 463538

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll

[amdgpu][nfc] Allocate kernel-specific LDS struct deterministically
ClosedPublic