Diff 416750

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Show All 24 Lines
// that fields can be elided based on more expensive analysis.		// that fields can be elided based on more expensive analysis.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"		#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"		#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
		#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"		#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/MDBuilder.h"		#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"		#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines
public:		public:
static char ID;		static char ID;

AMDGPULowerModuleLDS() : ModulePass(ID) {		AMDGPULowerModuleLDS() : ModulePass(ID) {
initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());		initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
}		}

bool runOnModule(Module &M) override {		bool runOnModule(Module &M) override {
		CallGraph CG = CallGraph(M);
UsedList = getUsedList(M);		UsedList = getUsedList(M);
bool Changed = superAlignLDSGlobals(M);		bool Changed = superAlignLDSGlobals(M);
Changed \|= processUsedLDS(M);		Changed \|= processUsedLDS(CG, M);

for (Function &F : M.functions()) {		for (Function &F : M.functions()) {
if (F.isDeclaration())		if (F.isDeclaration())
continue;		continue;

// Only lower compute kernels' LDS.		// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))		if (!AMDGPU::isKernel(F.getCallingConv()))
continue;		continue;
Changed \|= processUsedLDS(M, &F);		Changed \|= processUsedLDS(CG, M, &F);
}		}

UsedList.clear();		UsedList.clear();
return Changed;		return Changed;
}		}

private:		private:
// Increase the alignment of LDS globals if necessary to maximise the chance		// Increase the alignment of LDS globals if necessary to maximise the chance
Show All 35 Lines	for (auto &GV : M.globals()) {
if (Alignment != AMDGPU::getAlign(DL, &GV)) {		if (Alignment != AMDGPU::getAlign(DL, &GV)) {
Changed = true;		Changed = true;
GV.setAlignment(Alignment);		GV.setAlignment(Alignment);
}		}
}		}
return Changed;		return Changed;
}		}

bool processUsedLDS(Module &M, Function *F = nullptr) {		bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
LLVMContext &Ctx = M.getContext();		LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();		const DataLayout &DL = M.getDataLayout();

// Find variables to move into new struct instance		// Find variables to move into new struct instance
std::vector<GlobalVariable *> FoundLocalVars =		std::vector<GlobalVariable *> FoundLocalVars =
AMDGPU::findVariablesToLower(M, F);		AMDGPU::findVariablesToLower(M, F);

if (FoundLocalVars.empty()) {		if (FoundLocalVars.empty()) {
▲ Show 20 Lines • Show All 131 Lines • ▼ Show 20 Lines	bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {

// This ensures the variable is allocated when called functions access it.		// This ensures the variable is allocated when called functions access it.
// It also lets other passes, specifically PromoteAlloca, accurately		// It also lets other passes, specifically PromoteAlloca, accurately
// calculate how much LDS will be used by the kernel after lowering.		// calculate how much LDS will be used by the kernel after lowering.
if (!F) {		if (!F) {
IRBuilder<> Builder(Ctx);		IRBuilder<> Builder(Ctx);
for (Function &Func : M.functions()) {		for (Function &Func : M.functions()) {
if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {		if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
		const CallGraphNode *N = CG[&Func];
		const bool CalleesRequireModuleLDS = N->size() > 0;
		arsenmUnsubmitted Not Done Reply Inline Actions I don't think this actually works for indirect calls (or even calls through aliases) arsenm: I don't think this actually works for indirect calls (or even calls through aliases)
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Indirect calls are represented as an edge from 'outside'; I'll check that the external edge is accounted for under size. Indirect calls in general need careful handling, but this initial patch is only checking for any calls at all. JonChesterfield: Indirect calls are represented as an edge from 'outside'; I'll check that the external edge is…
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Yep, counted as expected. Indirect and normal calls both increment size by one. N calls to the same function increment by N. JonChesterfield: Yep, counted as expected. Indirect and normal calls both increment size by one. N calls to the…

		if (CalleesRequireModuleLDS) {
		// If a function this kernel might call requires module LDS,
		// annotate the kernel to let later passes know it will allocate
		// this structure, even if not apparent from the IR.
markUsedByKernel(Builder, &Func, SGV);		markUsedByKernel(Builder, &Func, SGV);
		} else {
		// However if we are certain this kernel cannot call a function that
		// requires module LDS, change the corresponding attribute from the
		// default of 'true' so the backend can elide it.
		Func.addFnAttr("amdgpu-requires-module-lds", "false");
		arsenmUnsubmitted Not Done Reply Inline Actions You don't need to have a value (no true), just set the attribute. I think all the bool-as-string variables are an antipattern that for some reason spread to a subset of attributes arsenm: You don't need to have a value (no true), just set the attribute. I think all the bool-as…
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Ah, that explains the getValueAsBool behaviour. Thanks JonChesterfield: Ah, that explains the getValueAsBool behaviour. Thanks
		}
}		}
}		}
}		}
return true;		return true;
}		}

void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,		void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
MDNode AliasScope, MDNode NoAlias,		MDNode AliasScope, MDNode NoAlias,
▲ Show 20 Lines • Show All 78 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Show First 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	protected:
bool NoSignedZerosFPMath = false;		bool NoSignedZerosFPMath = false;

// Function may be memory bound.		// Function may be memory bound.
bool MemoryBound = false;		bool MemoryBound = false;

// Kernel may need limited waves per EU for better performance.		// Kernel may need limited waves per EU for better performance.
bool WaveLimiter = false;		bool WaveLimiter = false;

		// A function this kernel calls requires the module lds struct.
		bool CalleeRequiresModuleLDS = true;
		arsenmUnsubmitted Not Done Reply Inline Actions Probably not much value in caching this in MFI arsenm: Probably not much value in caching this in MFI
		arsenmUnsubmitted Not Done Reply Inline Actions I guess might as well leave it since we have the others (although I'm not a huge fan of these cached fields we don't serialize) arsenm: I guess might as well leave it since we have the others (although I'm not a huge fan of these…
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions It makes some WIP stuff cleaner to remove this, updating JonChesterfield: It makes some WIP stuff cleaner to remove this, updating

public:		public:
AMDGPUMachineFunction(const MachineFunction &MF);		AMDGPUMachineFunction(const MachineFunction &MF);

uint64_t getExplicitKernArgSize() const {		uint64_t getExplicitKernArgSize() const {
return ExplicitKernArgSize;		return ExplicitKernArgSize;
}		}

unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }		unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }
Show All 19 Lines	public:
bool isMemoryBound() const {		bool isMemoryBound() const {
return MemoryBound;		return MemoryBound;
}		}

bool needsWaveLimiter() const {		bool needsWaveLimiter() const {
return WaveLimiter;		return WaveLimiter;
}		}

		bool calleeRequiresModuleLDS() const { return CalleeRequiresModuleLDS; }

unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);		unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
void allocateModuleLDSGlobal(const Module *M);		void allocateModuleLDSGlobal(const Module *M);

Align getDynLDSAlign() const { return DynLDSAlign; }		Align getDynLDSAlign() const { return DynLDSAlign; }

void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);		void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
};		};

}		}
#endif		#endif

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Show All 26 Lines	AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
const Function &F = MF.getFunction();		const Function &F = MF.getFunction();

Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");		Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");
MemoryBound = MemBoundAttr.getValueAsBool();		MemoryBound = MemBoundAttr.getValueAsBool();

Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");		Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
WaveLimiter = WaveLimitAttr.getValueAsBool();		WaveLimiter = WaveLimitAttr.getValueAsBool();

		const char *Attr = "amdgpu-requires-module-lds";
		if (F.hasFnAttribute(Attr))
		arsenmUnsubmitted Not Done Reply Inline Actions You can just do getFnAttribute, you don't need a second query arsenm: You can just do getFnAttribute, you don't need a second query
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions That returns 'false' for a missing attribute though, and I want the default state to be 'true' JonChesterfield: That returns 'false' for a missing attribute though, and I want the default state to be 'true'
		arsenmUnsubmitted Not Done Reply Inline Actions hasFnAttribute without the key value arsenm: hasFnAttribute without the key value
		CalleeRequiresModuleLDS = F.getFnAttribute(Attr).getValueAsBool();

CallingConv::ID CC = F.getCallingConv();		CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL \|\| CC == CallingConv::SPIR_KERNEL)		if (CC == CallingConv::AMDGPU_KERNEL \|\| CC == CallingConv::SPIR_KERNEL)
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);		ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
}		}

unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,		unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
const GlobalVariable &GV) {		const GlobalVariable &GV) {
auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));		auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
Show All 14 Lines	unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
// Update the LDS size considering the padding to align the dynamic shared		// Update the LDS size considering the padding to align the dynamic shared
// memory.		// memory.
LDSSize = alignTo(StaticLDSSize, DynLDSAlign);		LDSSize = alignTo(StaticLDSSize, DynLDSAlign);

return Offset;		return Offset;
}		}

void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {		void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
if (isModuleEntryFunction()) {		if (isModuleEntryFunction()) {
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions Not sure following the existing pattern makes sense here. Passing the function to allocateModuleLDSGlobal then checking the attribute in place is probably better. JonChesterfield: Not sure following the existing pattern makes sense here. Passing the function to…
const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");		const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
if (GV) {		if (GV && calleeRequiresModuleLDS()) {
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);		unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
(void)Offset;		(void)Offset;
assert(Offset == 0 &&		assert(Offset == 0 &&
"Module LDS expected to be allocated before other LDS");		"Module LDS expected to be allocated before other LDS");
}		}
}		}
}		}

Show All 12 Lines

llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll

Show All 15 Lines
;.		;.
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8		; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
; CHECK: @llvm.compiler.used = appending global [1 x i8] [i8 addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"		; CHECK: @llvm.compiler.used = appending global [1 x i8] [i8 addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"
; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16		; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16		; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2		; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2
; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4		; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4
;.		;.
define amdgpu_kernel void @k0() {		define amdgpu_kernel void @k0() #0 {
; CHECK-LABEL: @k0(		; CHECK-LABEL: @k0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8		; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4		; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16		; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16		; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
Show All 9 Lines	;
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4		store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*		%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16		store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void		ret void
}		}

define amdgpu_kernel void @k1() {		define amdgpu_kernel void @k1() #0 {
; CHECK-LABEL: @k1(		; CHECK-LABEL: @k1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4		; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16		; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16		; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*		%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2		store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*		%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4		store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*		%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16		store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void		ret void
}		}

define amdgpu_kernel void @0() {		define amdgpu_kernel void @0() #0 {
; CHECK-LABEL: @0(		; CHECK-LABEL: @0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2		; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*		%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2		store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

ret void		ret void
}		}

define amdgpu_kernel void @1() {		define amdgpu_kernel void @1() #0 {
; CHECK-LABEL: @1(		; CHECK-LABEL: @1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4		; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*		%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4		store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

ret void		ret void
Show All 10 Lines	;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*		%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1		store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1

%lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)*		%lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)*
store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4		store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4

ret void		ret void
}		}
;.
; CHECK: attributes #0 = { nofree nosync nounwind readnone willreturn }		attributes #0 = { "amdgpu-requires-module-lds"="false" }
;.		; CHECK: attributes #0 = { "amdgpu-requires-module-lds"="false" }

llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

	Show All 29 Lines
	; CHECK-LABEL: @set_func(i32 %x)			; CHECK-LABEL: @set_func(i32 %x)
	; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64)) to i32*), align 4			; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64)) to i32*), align 4
	define void @set_func(i32 %x) local_unnamed_addr #1 {			define void @set_func(i32 %x) local_unnamed_addr #1 {
	entry:			entry:
	store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64)) to i32*), align 4			store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64)) to i32*), align 4
	ret void			ret void
	}			}

	; CHECK-LABEL: @timestwo()			; CHECK-LABEL: @timestwo() #0
	; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]			; CHECK-NOT: call void @llvm.donothing()
	; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*			; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
	; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*			; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
	; CHECK: %3 = ptrtoint i32* %2 to i64			; CHECK: %3 = ptrtoint i32* %2 to i64
	; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64), %3			; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64), %3
	; CHECK: %5 = inttoptr i64 %4 to i32*			; CHECK: %5 = inttoptr i64 %4 to i32*
	; CHECK: %ld = load i32, i32* %5, align 4			; CHECK: %ld = load i32, i32* %5, align 4
	; CHECK: %mul = mul i32 %ld, 2			; CHECK: %mul = mul i32 %ld, 2
	; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*			; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
	; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*			; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
	; CHECK: %8 = ptrtoint i32* %7 to i64			; CHECK: %8 = ptrtoint i32* %7 to i64
	; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64)			; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64)
	; CHECK: %10 = inttoptr i64 %9 to i32*			; CHECK: %10 = inttoptr i64 %9 to i32*
	; CHECK: store i32 %mul, i32* %10, align 4			; CHECK: store i32 %mul, i32* %10, align 4
	define amdgpu_kernel void @timestwo() {			define amdgpu_kernel void @timestwo() {
	%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)) to i32) to i64)) to i32*), align 4			%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)) to i32) to i64)) to i32*), align 4
	%mul = mul i32 %ld, 2			%mul = mul i32 %ld, 2
	store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64)) to i32*), align 4			store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64)) to i32*), align 4
	ret void			ret void
	}			}

				attributes #0 = { "amdgpu-requires-module-lds"="false" }
				; CHECK: attributes #0 = { "amdgpu-requires-module-lds"="false" }

llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s \| FileCheck -check-prefix=OPT %s			; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s \| FileCheck -check-prefix=OPT %s
	; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s \| FileCheck -check-prefix=OPT %s			; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s \| FileCheck -check-prefix=OPT %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

	; Check that module LDS is allocated at address 0 and kernel starts its			; Check that module LDS is allocated at address 0 and kernel starts its
	; allocation past module LDS.			; allocation past module LDS when a call is present.

	@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1			@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
	@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16			@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

	; GCN-LABEL: {{^}}k0:			; GCN-LABEL: {{^}}k0:
	; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0			; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
	; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1			; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
	; GCN: ds_write_b8 [[NULL]], [[ONE]]			; GCN: ds_write_b8 [[NULL]], [[ONE]]
	; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2			; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
	; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16			; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16
	define amdgpu_kernel void @k0() {			define amdgpu_kernel void @k0() {
	; OPT-LABEL: @k0(			; OPT-LABEL: @k0(
	; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.]] addrspace(3) @llvm.amdgcn.module.lds) ]			; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.]] addrspace(3) @llvm.amdgcn.module.lds) ]
	; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.]] = bitcast [1 x i8] addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*			; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.]] = bitcast [1 x i8] addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
	; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1			; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
	; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.]] = bitcast [16 x i8] addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*			; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.]] = bitcast [16 x i8] addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
	; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16			; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
				; OPT-NEXT: call void @f0()
	; OPT-NEXT: ret void			; OPT-NEXT: ret void
	;			;
	%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*			%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
	store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1			store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
	%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*			%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
	store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16			store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
				call void @f0()
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}f0:			; GCN-LABEL: {{^}}f0:
	; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0			; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
	; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3			; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
	; GCN: ds_write_b8 [[NULL]], [[TREE]]			; GCN: ds_write_b8 [[NULL]], [[TREE]]
	define void @f0() {			define void @f0() {
	; OPT-LABEL: @f0(			; OPT-LABEL: @f0() {
	; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.]] = bitcast [1 x i8] addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*			; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.]] = bitcast [1 x i8] addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
	; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1			; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
	; OPT-NEXT: ret void			; OPT-NEXT: ret void
	;			;
	%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*			%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
	store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1			store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
	ret void			ret void
	}			}

				; Without the function call, no module.lds.t at address zero is necessary so the
				; two variables are allocated in unspecified order. This is a weakness in current
				; codegen - the variable is moved into module.lds.t as a different function required
				; that, then module.lds.t is allocated as a normal variable. Coincidentally codegen
				; currently puts the higher alignment variable at zero. A later patch will avoid
				; moving variables into module.lds.t in kernels where that is not required, leaving
				; the variables to be optimally allocated as part of the kernel specific struct.
				; GCN-LABEL: {{^}}k1:
				; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
				; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
				; GCN: ds_write_b8 [[NULL]], [[ONE]] offset:16
				; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
				; GCN: ds_write_b8 [[NULL]], [[TWO]]
				define amdgpu_kernel void @k1() #0 {
				JonChesterfield (author, inline comment, marked done): Sadly, I think the ordering here is too unstable to hit from a test case, so I am dropping it in the next diff.
				; OPT-LABEL: @k1(
				; OPT-NOT: call void @llvm.donothing()
				; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.]] = bitcast [1 x i8] addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
				; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
				; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.]] = bitcast [16 x i8] addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K1_LDS_T:%.]], [[LLVM_AMDGCN_KERNEL_K1_LDS_T]] addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
				; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
				; OPT-NEXT: ret void
				;
				%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
				store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
				%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
				store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
				ret void
				}


				attributes #0 = { "amdgpu-requires-module-lds"="false" }
				; CHECK: attributes #0 = { "amdgpu-requires-module-lds"="false" }

llvm/test/CodeGen/AMDGPU/lower-module-lds.ll

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	; CHECK: call void @func()			; CHECK: call void @func()
	; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 8			; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 8
	define amdgpu_kernel void @kern_call() {			define amdgpu_kernel void @kern_call() {
	call void @func()			call void @func()
	%dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic			%dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
	ret void			ret void
	}			}

	; This kernel does not need to alloc the LDS block as it makes no calls			; This kernel does not alloc the LDS block as it makes no calls
	; CHECK-LABEL: @kern_empty()			; CHECK-LABEL: @kern_empty()
	; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]			; CHECK-NOT: call void @llvm.donothing()
	define spir_kernel void @kern_empty() {			define spir_kernel void @kern_empty() #0{
	ret void			ret void
	}			}

	; Make sure we don't crash trying to insert code into a kernel			; Make sure we don't crash trying to insert code into a kernel
	; declaration.			; declaration.
	declare amdgpu_kernel void @kernel_declaration()			declare amdgpu_kernel void @kernel_declaration()

				attributes #0 = { "amdgpu-requires-module-lds"="false" }
				; CHECK: attributes #0 = { "amdgpu-requires-module-lds"="false" }

This is an archive of the discontinued LLVM Phabricator instance.

[amdgpu] Elide module lds allocation in kernels with no callees
ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 416750

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds.ll

This is an archive of the discontinued LLVM Phabricator instance.

[amdgpu] Elide module lds allocation in kernels with no callees
ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 416750

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds.ll

[amdgpu] Elide module lds allocation in kernels with no callees
ClosedPublic