Diff 427142

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Show First 20 Lines • Show All 492 Lines • ▼ Show 20 Lines	bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
MachineFunction &MF = B.getMF();		MachineFunction &MF = B.getMF();
const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();		const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();		MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();		SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();		const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();		const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();		const DataLayout &DL = F.getParent()->getDataLayout();

Info->allocateModuleLDSGlobal(F.getParent());		Info->allocateModuleLDSGlobal(F);

SmallVector<CCValAssign, 16> ArgLocs;		SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());		CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

allocateHSAUserSGPRs(CCInfo, B, MF, TRI, Info);		allocateHSAUserSGPRs(CCInfo, B, MF, TRI, Info);

unsigned i = 0;		unsigned i = 0;
const Align KernArgBaseAlign(16);		const Align KernArgBaseAlign(16);
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	bool AMDGPUCallLowering::lowerFormalArguments(
MachineFunction &MF = B.getMF();		MachineFunction &MF = B.getMF();
MachineBasicBlock &MBB = B.getMBB();		MachineBasicBlock &MBB = B.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();		MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();		SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();		const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();		const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();		const DataLayout &DL = F.getParent()->getDataLayout();

Info->allocateModuleLDSGlobal(F.getParent());		Info->allocateModuleLDSGlobal(F);

SmallVector<CCValAssign, 16> ArgLocs;		SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());		CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

if (Info->hasImplicitBufferPtr()) {		if (Info->hasImplicitBufferPtr()) {
Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);		Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);		MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);		CCInfo.AllocateReg(ImplicitBufferPtrReg);
▲ Show 20 Lines • Show All 822 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Show All 24 Lines
// that fields can be elided based on more expensive analysis.		// that fields can be elided based on more expensive analysis.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"		#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"		#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
		#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"		#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/MDBuilder.h"		#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"		#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"		#include "llvm/Pass.h"
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines
public:		public:
static char ID;		static char ID;

AMDGPULowerModuleLDS() : ModulePass(ID) {		AMDGPULowerModuleLDS() : ModulePass(ID) {
initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());		initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
}		}

bool runOnModule(Module &M) override {		bool runOnModule(Module &M) override {
		CallGraph CG = CallGraph(M);
UsedList = getUsedList(M);		UsedList = getUsedList(M);
bool Changed = superAlignLDSGlobals(M);		bool Changed = superAlignLDSGlobals(M);
Changed |= processUsedLDS(M);		Changed |= processUsedLDS(CG, M);

for (Function &F : M.functions()) {		for (Function &F : M.functions()) {
if (F.isDeclaration())		if (F.isDeclaration())
continue;		continue;

// Only lower compute kernels' LDS.		// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))		if (!AMDGPU::isKernel(F.getCallingConv()))
continue;		continue;
Changed |= processUsedLDS(M, &F);		Changed |= processUsedLDS(CG, M, &F);
}		}

UsedList.clear();		UsedList.clear();
return Changed;		return Changed;
}		}

private:		private:
// Increase the alignment of LDS globals if necessary to maximise the chance		// Increase the alignment of LDS globals if necessary to maximise the chance
Show All 35 Lines	for (auto &GV : M.globals()) {
if (Alignment != AMDGPU::getAlign(DL, &GV)) {		if (Alignment != AMDGPU::getAlign(DL, &GV)) {
Changed = true;		Changed = true;
GV.setAlignment(Alignment);		GV.setAlignment(Alignment);
}		}
}		}
return Changed;		return Changed;
}		}

bool processUsedLDS(Module &M, Function *F = nullptr) {		bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
LLVMContext &Ctx = M.getContext();		LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();		const DataLayout &DL = M.getDataLayout();

// Find variables to move into new struct instance		// Find variables to move into new struct instance
std::vector<GlobalVariable *> FoundLocalVars =		std::vector<GlobalVariable *> FoundLocalVars =
AMDGPU::findVariablesToLower(M, F);		AMDGPU::findVariablesToLower(M, F);

if (FoundLocalVars.empty()) {		if (FoundLocalVars.empty()) {
▲ Show 20 Lines • Show All 131 Lines • ▼ Show 20 Lines	bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {

// This ensures the variable is allocated when called functions access it.		// This ensures the variable is allocated when called functions access it.
// It also lets other passes, specifically PromoteAlloca, accurately		// It also lets other passes, specifically PromoteAlloca, accurately
// calculate how much LDS will be used by the kernel after lowering.		// calculate how much LDS will be used by the kernel after lowering.
if (!F) {		if (!F) {
IRBuilder<> Builder(Ctx);		IRBuilder<> Builder(Ctx);
for (Function &Func : M.functions()) {		for (Function &Func : M.functions()) {
if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {		if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
		const CallGraphNode *N = CG[&Func];
		const bool CalleesRequireModuleLDS = N->size() > 0;
		arsenm (Unsubmitted, Not Done): I don't think this actually works for indirect calls (or even calls through aliases).
		JonChesterfield (Author, Unsubmitted, Done): Indirect calls are represented as an edge from 'outside', I'll check that external edge is accounted under size. Indirect calls in general need careful handling, but this initial patch is only checking for any calls at all.
		JonChesterfield (Author, Unsubmitted, Done): Yep, counted as expected. Indirect and normal calls both increment size by one. N calls to the same function increment by N.

		if (CalleesRequireModuleLDS) {
		// If a function this kernel might call requires module LDS,
		// annotate the kernel to let later passes know it will allocate
		// this structure, even if not apparent from the IR.
markUsedByKernel(Builder, &Func, SGV);		markUsedByKernel(Builder, &Func, SGV);
		} else {
		// However if we are certain this kernel cannot call a function that
		// requires module LDS, annotate the kernel so the backend can elide
		// the allocation without repeating callgraph walks.
		Func.addFnAttr("amdgpu-elide-module-lds");
		arsenm (Unsubmitted, Not Done): You don't need to have a value (no true), just set the attribute. I think all the bool-as-string variables are an antipattern that for some reason spread to a subset of attributes.
		JonChesterfield (Author, Unsubmitted, Done): Ah, that explains the getValueAsBool behaviour. Thanks
		}
}		}
}		}
}		}
return true;		return true;
}		}

void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,		void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
MDNode AliasScope, MDNode NoAlias,		MDNode AliasScope, MDNode NoAlias,
▲ Show 20 Lines • Show All 78 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Show First 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	protected:

// Function may be memory bound.		// Function may be memory bound.
bool MemoryBound = false;		bool MemoryBound = false;

// Kernel may need limited waves per EU for better performance.		// Kernel may need limited waves per EU for better performance.
bool WaveLimiter = false;		bool WaveLimiter = false;

public:		public:
AMDGPUMachineFunction(const MachineFunction &MF);		AMDGPUMachineFunction(const MachineFunction &MF);
		arsenm (Unsubmitted, Not Done): Probably not much value in caching this in MFI
		arsenm (Unsubmitted, Not Done): I guess might as well leave it since we have the others (although I'm not a huge fan of these cached fields we don't serialize)
		JonChesterfield (Author, Unsubmitted, Done): It makes some WIP stuff cleaner to remove this, updating

uint64_t getExplicitKernArgSize() const {		uint64_t getExplicitKernArgSize() const {
return ExplicitKernArgSize;		return ExplicitKernArgSize;
}		}

unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }		unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }

uint32_t getLDSSize() const {		uint32_t getLDSSize() const {
Show All 22 Lines	bool isMemoryBound() const {
return MemoryBound;		return MemoryBound;
}		}

bool needsWaveLimiter() const {		bool needsWaveLimiter() const {
return WaveLimiter;		return WaveLimiter;
}		}

unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);		unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
void allocateModuleLDSGlobal(const Module *M);		void allocateModuleLDSGlobal(const Function &F);

Align getDynLDSAlign() const { return DynLDSAlign; }		Align getDynLDSAlign() const { return DynLDSAlign; }

void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);		void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
};		};

}		}
#endif		#endif

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Show All 28 Lines	AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)

Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");		Attribute MemBoundAttr = F.getFnAttribute("amdgpu-memory-bound");
MemoryBound = MemBoundAttr.getValueAsBool();		MemoryBound = MemBoundAttr.getValueAsBool();

Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");		Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
WaveLimiter = WaveLimitAttr.getValueAsBool();		WaveLimiter = WaveLimitAttr.getValueAsBool();

// FIXME: How is this attribute supposed to interact with statically known		// FIXME: How is this attribute supposed to interact with statically known
// global sizes?		// global sizes?
		arsenm (Unsubmitted, Not Done): You can just do getFnAttribute, you don't need a second query
		JonChesterfield (Author, Unsubmitted, Done): That returns 'false' for a missing attribute though, and I want the default state to be 'true'
		arsenm (Unsubmitted, Not Done): hasFnAttribute without the key value
StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();		StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
if (!S.empty())		if (!S.empty())
S.consumeInteger(0, GDSSize);		S.consumeInteger(0, GDSSize);

// Assume the attribute allocates before any known GDS globals.		// Assume the attribute allocates before any known GDS globals.
StaticGDSSize = GDSSize;		StaticGDSSize = GDSSize;

CallingConv::ID CC = F.getCallingConv();		CallingConv::ID CC = F.getCallingConv();
Show All 32 Lines	if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// FIXME: Apply alignment of dynamic GDS		// FIXME: Apply alignment of dynamic GDS
GDSSize = StaticGDSSize;		GDSSize = StaticGDSSize;
}		}

Entry.first->second = Offset;		Entry.first->second = Offset;
return Offset;		return Offset;
}		}

void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {		// This kernel calls no functions that require the module lds struct
		static bool canElideModuleLDS(const Function &F) {
		return F.hasFnAttribute("amdgpu-elide-module-lds");
		}

		void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
		const Module *M = F.getParent();
if (isModuleEntryFunction()) {		if (isModuleEntryFunction()) {
		JonChesterfield (Author, Unsubmitted, Done): Not sure following the existing pattern makes sense here. Passing the function to allocateModuleLDSGlobal then checking the attribute in place is probably better.
const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");		const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
if (GV) {		if (GV && !canElideModuleLDS(F)) {
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);		unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
(void)Offset;		(void)Offset;
assert(Offset == 0 &&		assert(Offset == 0 &&
"Module LDS expected to be allocated before other LDS");		"Module LDS expected to be allocated before other LDS");
}		}
}		}
}		}

Show All 12 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,441 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerFormalArguments(

if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {		if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
DiagnosticInfoUnsupported NoGraphicsHSA(		DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());		Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);		DAG.getContext()->diagnose(NoGraphicsHSA);
return DAG.getEntryNode();		return DAG.getEntryNode();
}		}

Info->allocateModuleLDSGlobal(Fn.getParent());		Info->allocateModuleLDSGlobal(Fn);

SmallVector<ISD::InputArg, 16> Splits;		SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;		SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());		BitVector Skipped(Ins.size());
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,		CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());		*DAG.getContext());

bool IsGraphics = AMDGPU::isGraphics(CallConv);		bool IsGraphics = AMDGPU::isGraphics(CallConv);
▲ Show 20 Lines • Show All 10,269 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll

Show All 15 Lines
;.		;.
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8		; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
; CHECK: @llvm.compiler.used = appending global [1 x i8] [i8 addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"		; CHECK: @llvm.compiler.used = appending global [1 x i8] [i8 addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"
; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16		; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16		; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2		; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2
; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4		; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4
;.		;.
define amdgpu_kernel void @k0() {		define amdgpu_kernel void @k0() #0 {
; CHECK-LABEL: @k0(		; CHECK-LABEL: @k0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8		; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4		; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16		; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16		; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
Show All 9 Lines	;
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4		store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*		%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16		store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void		ret void
}		}

define amdgpu_kernel void @k1() {		define amdgpu_kernel void @k1() #0 {
; CHECK-LABEL: @k1(		; CHECK-LABEL: @k1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4		; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16		; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16		; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*		%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2		store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*		%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4		store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*		%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16		store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16

ret void		ret void
}		}

define amdgpu_kernel void @0() {		define amdgpu_kernel void @0() #0 {
; CHECK-LABEL: @0(		; CHECK-LABEL: @0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2		; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*		%lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2		store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2

ret void		ret void
}		}

define amdgpu_kernel void @1() {		define amdgpu_kernel void @1() #0 {
; CHECK-LABEL: @1(		; CHECK-LABEL: @1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*		; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4		; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*		%lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4		store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4

ret void		ret void
Show All 10 Lines	;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*		%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1		store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1

%lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)*		%lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)*
store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4		store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4

ret void		ret void
}		}
;.
; CHECK: attributes #0 = { nocallback nofree nosync nounwind readnone willreturn }		attributes #0 = { "amdgpu-elide-module-lds" }
;.		; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

	Show All 29 Lines
	; CHECK-LABEL: @set_func(i32 %x)			; CHECK-LABEL: @set_func(i32 %x)
	; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64)) to i32*), align 4			; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64)) to i32*), align 4
	define void @set_func(i32 %x) local_unnamed_addr #1 {			define void @set_func(i32 %x) local_unnamed_addr #1 {
	entry:			entry:
	store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64)) to i32*), align 4			store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)) to i32) to i64)) to i32*), align 4
	ret void			ret void
	}			}

	; CHECK-LABEL: @timestwo()			; CHECK-LABEL: @timestwo() #0
	; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]			; CHECK-NOT: call void @llvm.donothing()
	; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*			; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
	; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*			; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
	; CHECK: %3 = ptrtoint i32* %2 to i64			; CHECK: %3 = ptrtoint i32* %2 to i64
	; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64), %3			; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)) to i32) to i64), %3
	; CHECK: %5 = inttoptr i64 %4 to i32*			; CHECK: %5 = inttoptr i64 %4 to i32*
	; CHECK: %ld = load i32, i32* %5, align 4			; CHECK: %ld = load i32, i32* %5, align 4
	; CHECK: %mul = mul i32 %ld, 2			; CHECK: %mul = mul i32 %ld, 2
	; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*			; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
	; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*			; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
	; CHECK: %8 = ptrtoint i32* %7 to i64			; CHECK: %8 = ptrtoint i32* %7 to i64
	; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)			; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)
	; CHECK: %10 = inttoptr i64 %9 to i32*			; CHECK: %10 = inttoptr i64 %9 to i32*
	; CHECK: store i32 %mul, i32* %10, align 4			; CHECK: store i32 %mul, i32* %10, align 4
	define amdgpu_kernel void @timestwo() {			define amdgpu_kernel void @timestwo() {
	%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4			%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
	%mul = mul i32 %ld, 2			%mul = mul i32 %ld, 2
	store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4			store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
	ret void			ret void
	}			}

				attributes #0 = { "amdgpu-elide-module-lds" }
				; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s			; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
	; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s			; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

	; Check that module LDS is allocated at address 0 and kernel starts its			; Check that module LDS is allocated at address 0 and kernel starts its
	; allocation past module LDS.			; allocation past module LDS when a call is present.

	@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1			@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
	@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16			@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

	; GCN-LABEL: {{^}}k0:			; GCN-LABEL: {{^}}k0:
	; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0			; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
	; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1			; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
	; GCN: ds_write_b8 [[NULL]], [[ONE]]			; GCN: ds_write_b8 [[NULL]], [[ONE]]
	; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2			; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
	; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16			; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16
	define amdgpu_kernel void @k0() {			define amdgpu_kernel void @k0() {
	; OPT-LABEL: @k0(			; OPT-LABEL: @k0(
	; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]			; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
	; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*			; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
	; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1			; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
	; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*			; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
	; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16			; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
				; OPT-NEXT: call void @f0()
	; OPT-NEXT: ret void			; OPT-NEXT: ret void
	;			;
	%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*			%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
	store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1			store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
	%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*			%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
	store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16			store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
				call void @f0()
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}f0:			; GCN-LABEL: {{^}}f0:
	; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0			; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
	; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3			; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
	; GCN: ds_write_b8 [[NULL]], [[TREE]]			; GCN: ds_write_b8 [[NULL]], [[TREE]]
	define void @f0() {			define void @f0() {
	; OPT-LABEL: @f0(			; OPT-LABEL: @f0() {
	; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*			; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
	; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1			; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
	; OPT-NEXT: ret void			; OPT-NEXT: ret void
	;			;
	%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*			%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
	store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1			store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
	ret void			ret void
	}			}

				attributes #0 = { "amdgpu-elide-module-lds" }
				; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }
				JonChesterfield (Author; Unsubmitted; Done): Sadly, I think the ordering here is too unstable to hit from a test case; dropping it in the next diff.

llvm/test/CodeGen/AMDGPU/lower-module-lds.ll

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	; CHECK: call void @func()			; CHECK: call void @func()
	; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 8			; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 8
	define amdgpu_kernel void @kern_call() {			define amdgpu_kernel void @kern_call() {
	call void @func()			call void @func()
	%dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic			%dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
	ret void			ret void
	}			}

	; This kernel does not need to alloc the LDS block as it makes no calls			; This kernel does not alloc the LDS block as it makes no calls
	; CHECK-LABEL: @kern_empty()			; CHECK-LABEL: @kern_empty()
	; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]			; CHECK-NOT: call void @llvm.donothing()
	define spir_kernel void @kern_empty() {			define spir_kernel void @kern_empty() #0{
	ret void			ret void
	}			}

	; Make sure we don't crash trying to insert code into a kernel			; Make sure we don't crash trying to insert code into a kernel
	; declaration.			; declaration.
	declare amdgpu_kernel void @kernel_declaration()			declare amdgpu_kernel void @kernel_declaration()

				attributes #0 = { "amdgpu-elide-module-lds" }
				; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }

This is an archive of the discontinued LLVM Phabricator instance.

[amdgpu] Elide module lds allocation in kernels with no callees
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 427142

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds.ll

This is an archive of the discontinued LLVM Phabricator instance.

[amdgpu] Elide module lds allocation in kernels with no callees
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 427142

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll

llvm/test/CodeGen/AMDGPU/lower-module-lds.ll

[amdgpu] Elide module lds allocation in kernels with no callees
ClosedPublic