diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -72,6 +72,7 @@
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+ModulePass *createAMDGPULowerFunctionLDSPass();
 FunctionPass *createSIModeRegisterPass();
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
@@ -146,6 +147,13 @@
   TargetMachine &TM;
 };
 
+void initializeAMDGPULowerFunctionLDSPass(PassRegistry &);
+extern char &AMDGPULowerFunctionLDSID;
+
+struct AMDGPULowerFunctionLDSPass : PassInfoMixin<AMDGPULowerFunctionLDSPass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLDSPass.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLDSPass.cpp
@@ -0,0 +1,340 @@
+//===-- AMDGPULowerFunctionLDSPass.cpp ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates LDS uses from non-kernel functions.
+//
+// The strategy is to create a new struct with a field for each LDS variable
+// and allocate that struct at the same address for every kernel. Uses of the
+// original LDS variables are then replaced with compile-time offsets from
+// that known address. AMDGPUMachineFunction allocates the LDS global.
+//
+// Local variables with constant annotation or non-undef initializer are
+// passed through unchanged for simplification or error diagnostics in later
+// passes.
+//
+// To reduce the memory overhead, variables that are only used by kernels are
+// excluded from this transform. The analysis to determine whether a variable
+// is only used by kernels is cheap and conservative, so this may allocate a
+// variable in every kernel even when that was not strictly necessary.
+//
+// A possible future refinement is to specialise the structure per-kernel, so
+// that fields can be elided based on more expensive analysis.
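+//
+// For illustration (a sketch of the rewrite, mirroring the
+// lower-function-lds.ll test added in this patch), a module such as
+//
+//   @var = addrspace(3) global float undef
+//   define void @f() {
+//     %v = atomicrmw fsub float addrspace(3)* @var, float 1.0 monotonic
+//     ret void
+//   }
+//
+// is rewritten so that @var becomes a field of %llvm.amdgcn.function.lds.t,
+// the atomicrmw operates on a constantexpr gep into the struct instance at
+// the known address, and each kernel is marked so that the block is
+// allocated for it.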
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-lower-function-lds"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerFunctionLDS : public ModulePass {
+
+  static bool isKernelCC(Function *Func) {
+    return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+  }
+
+  static Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+    return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                         GV->getValueType());
+  }
+
+  static bool
+  useByKernelOrUsedList(const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                        const Use &U, uint64_t Depth = 0) {
+    // Variables used by kernels or llvm.used are not moved into the new struct
+    User *V = U.getUser();
+
+    if (Depth > 4) {
+      return false;
+    }
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      if (isKernelCC(I->getFunction())) {
+        return true;
+      }
+    }
+    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
+      if (UsedList.contains(G)) {
+        return true;
+      }
+    }
+    if (auto *E = dyn_cast<ConstantExpr>(V)) {
+      return std::all_of(E->use_begin(), E->use_end(), [&](const Use &E) {
+        return useByKernelOrUsedList(UsedList, E, Depth + 1);
+      });
+    }
+    return false;
+  }
+
+  static std::vector<GlobalVariable *>
+  findVariablesToLower(Module &M,
+                       const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+    std::vector<GlobalVariable *> LocalVars;
+    for (auto &GV : M.globals()) {
+      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+        continue;
+      }
+      if (!GV.hasInitializer()) {
+        // addrspace(3) without initializer implies cuda/hip extern __shared__
+        // the semantics for such a variable appears to be that all extern
+        // __shared__ variables alias one another, in which case this
+        // transform is not required
+        continue;
+      }
+      if (!isa<UndefValue>(GV.getInitializer())) {
+        // Initializers are unimplemented for local address space.
+        // Leave such variables in place for consistent error reporting.
+        continue;
+      }
+      if (GV.isConstant()) {
+        // A constant undef variable can't be written to, and any load is
+        // undef, so it should be eliminated by the optimizer. It could be
+        // dropped by the back end if not. This pass skips over it.
+ continue; + } + if (std::all_of(GV.use_begin(), GV.use_end(), [&](const Use &U) { + return useByKernelOrUsedList(UsedList, U); + })) { + continue; + } + LocalVars.push_back(&GV); + } + return LocalVars; + } + + static void removeFromUsedList(Module &M, StringRef Name, + SmallPtrSetImpl &ToRemove) { + GlobalVariable *GV = M.getGlobalVariable(Name); + if (!GV || ToRemove.empty()) { + return; + } + + SmallVector Init; + auto *CA = cast(GV->getInitializer()); + for (auto &Op : CA->operands()) { + // ModuleUtils::appendToUsed only inserts Constants + Constant *C = cast(Op); + if (!ToRemove.contains(C->stripPointerCasts())) { + Init.push_back(C); + } + } + + if (Init.size() == CA->getNumOperands()) { + return; // none to remove + } + + GV->eraseFromParent(); + + if (!Init.empty()) { + ArrayType *ATy = + ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size()); + GV = + new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, Init), Name); + GV->setSection("llvm.metadata"); + } + } + + static void + removeFromUsedLists(Module &M, + const std::vector &LocalVars) { + SmallPtrSet LocalVarsSet; + for (size_t I = 0; I < LocalVars.size(); I++) { + if (Constant *C = dyn_cast(LocalVars[I]->stripPointerCasts())) { + LocalVarsSet.insert(C); + } + } + removeFromUsedList(M, "llvm.used", LocalVarsSet); + removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); + } + + static void markUsedByKernel(IRBuilder<> &Builder, Function *Func, + GlobalVariable *SGV) { + LLVMContext &Ctx = Func->getContext(); + Type *Ty = SGV->getValueType(); + + Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI()); + + FunctionType *FTy = FunctionType::get( + Type::getVoidTy(Ctx), {PointerType::get(Ty, AMDGPUAS::LOCAL_ADDRESS)}, + false); + + // Could use an intrinsic which is erased by the backend after LDS lowering + InlineAsm *IA = + InlineAsm::get(FTy, "// Alloc function lds block", "s", true); + Builder.CreateCall( + IA, {Builder.CreateInBoundsGEP( + Ty, SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))}); + } + +public: + static char ID; + + AMDGPULowerFunctionLDS() : ModulePass(ID) { + initializeAMDGPULowerFunctionLDSPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + LLVMContext &Ctx = M.getContext(); + DataLayout const &DL = M.getDataLayout(); + + SmallPtrSet UsedList; + collectUsedGlobalVariables(M, UsedList, true); + collectUsedGlobalVariables(M, UsedList, false); + + // Find variables to move into new struct instance + std::vector FoundLocalVars = + findVariablesToLower(M, UsedList); + + if (FoundLocalVars.empty()) { + // No variables to rewrite, no changes made. + return false; + } + + // Sort by alignment, descending, to minimise padding. + // On ties, sort by size, descending, then by name, lexicographical. + llvm::sort( + FoundLocalVars.begin(), FoundLocalVars.end(), + [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool { + Align ALHS = getAlign(DL, LHS); + Align ARHS = getAlign(DL, RHS); + if (ALHS != ARHS) { + return ALHS > ARHS; + } + + TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType()); + TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType()); + if (SLHS != SRHS) { + return SLHS > SRHS; + } + + // By variable name on tie for predictable order in test cases. 
+          return LHS->getName() < RHS->getName();
+        });
+
+    std::vector<GlobalVariable *> LocalVars;
+    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+    {
+      // This usually won't need to insert any padding, perhaps avoid the alloc
+      uint64_t CurrentOffset = 0;
+      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
+        GlobalVariable *FGV = FoundLocalVars[I];
+        Align DataAlign = getAlign(DL, FGV);
+
+        uint64_t DataAlignV = DataAlign.value();
+        if (uint64_t Rem = CurrentOffset % DataAlignV) {
+          uint64_t Padding = DataAlignV - Rem;
+
+          // Append an array of padding bytes to meet alignment requested
+          // Note (o +      (a - (o % a)) ) % a == 0
+          //      (offset + Padding       ) % align == 0
+
+          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+          LocalVars.push_back(new GlobalVariable(
+              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+              false));
+          CurrentOffset += Padding;
+        }
+
+        LocalVars.push_back(FGV);
+        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+      }
+    }
+
+    std::vector<Type *> LocalVarTypes;
+    LocalVarTypes.reserve(LocalVars.size());
+    std::transform(
+        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+    StructType *LDSTy = StructType::create(
+        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.function.lds.t"));
+
+    Align MaxAlign = getAlign(DL, LocalVars[0]); // was sorted on alignment
+    Constant *InstanceAddress = Constant::getIntegerValue(
+        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+
+    GlobalVariable *SGV = new GlobalVariable(
+        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+        "llvm.amdgcn.function.lds", nullptr, GlobalValue::NotThreadLocal,
+        AMDGPUAS::LOCAL_ADDRESS, false);
+    SGV->setAlignment(MaxAlign);
+    appendToCompilerUsed(
+        M, {static_cast<GlobalValue *>(
+               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    // The verifier rejects used lists containing an inttoptr of a constant
+    // so remove the variables from these lists before replaceAllUsesWith
+    removeFromUsedLists(M, LocalVars);
+
+    // Replace uses of ith variable with a constantexpr to the ith field of the
+    // instance that will be allocated by AMDGPUMachineFunction
+    Type *I32 = Type::getInt32Ty(Ctx);
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      GV->replaceAllUsesWith(
+          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+      GV->eraseFromParent();
+    }
+
+    // Mark kernels with asm that reads the address of the allocated structure
+    // This is not necessary for lowering. This lets other passes, specifically
+    // PromoteAlloca, accurately calculate how much LDS will be used by the
+    // kernel after lowering.
+    {
+      IRBuilder<> Builder(Ctx);
+      SmallPtrSet<Function *, 32> Kernels;
+      for (auto &I : M.functions()) {
+        Function *Func = &I;
+        if (isKernelCC(Func)) {
+          if (!Kernels.contains(Func)) {
+            markUsedByKernel(Builder, Func, SGV);
+            Kernels.insert(Func);
+          }
+        }
+      }
+    }
+    return true;
+  }
+};
+
+} // namespace
+char AMDGPULowerFunctionLDS::ID = 0;
+
+char &llvm::AMDGPULowerFunctionLDSID = AMDGPULowerFunctionLDS::ID;
+
+INITIALIZE_PASS(AMDGPULowerFunctionLDS, DEBUG_TYPE,
+                "Lower uses of LDS variables from non-kernel functions", false,
+                false)
+
+ModulePass *llvm::createAMDGPULowerFunctionLDSPass() {
+  return new AMDGPULowerFunctionLDS();
+}
+
+PreservedAnalyses AMDGPULowerFunctionLDSPass::run(Module &M,
+                                                  ModuleAnalysisManager &) {
+  return AMDGPULowerFunctionLDS().runOnModule(M) ? PreservedAnalyses::none()
+                                                 : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -38,6 +38,12 @@
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
+
+  const Module *M = F.getParent();
+  GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.function.lds");
+  if (GV) {
+    allocateLDSGlobal(M->getDataLayout(), *GV);
+  }
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -126,8 +126,13 @@
 char AMDGPUPromoteAlloca::ID = 0;
 char AMDGPUPromoteAllocaToVector::ID = 0;
 
-INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
-                "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                      "AMDGPU promote alloca to vector or LDS", false, false)
+// Move LDS uses from functions to kernels before promote alloca for accurate
+// estimation of LDS available
+INITIALIZE_PASS_DEPENDENCY(AMDGPULowerFunctionLDS)
+INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                    "AMDGPU promote alloca to vector or LDS", false, false)
 
 INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                 "AMDGPU promote alloca to vector", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -193,6 +193,11 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool>
+    DisableLowerFunctionLDS("amdgpu-disable-lower-function-lds", cl::Hidden,
+                            cl::desc("Disable lower function lds pass"),
+                            cl::init(false));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -234,6 +239,7 @@
   initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
+  initializeAMDGPULowerFunctionLDSPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
@@ -470,6 +476,10 @@
         // and before other cleanup optimizations.
         PM.add(createAMDGPULowerKernelAttributesPass());
 
+        if (!DisableLowerFunctionLDS) {
+          PM.add(createAMDGPULowerFunctionLDSPass());
+        }
+
         // Promote alloca to vector before SROA and loop unroll. If we manage
         // to eliminate allocas before unroll we may choose to unroll less.
         if (EnableOpt)
@@ -502,6 +512,12 @@
           PM.addPass(AMDGPUAlwaysInlinePass());
           return true;
         }
+        if (PassName == "amdgpu-lower-function-lds") {
+          if (!DisableLowerFunctionLDS) {
+            PM.addPass(AMDGPULowerFunctionLDSPass());
+          }
+          return true;
+        }
         return false;
       });
   PB.registerPipelineParsingCallback(
@@ -531,7 +547,6 @@
           PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
           return true;
         }
-
         return false;
       });
 
@@ -876,6 +891,11 @@
   // Replace OpenCL enqueued block function pointers with global variables.
   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
 
+  // Can increase LDS used by kernel so runs before PromoteAlloca
+  if (!DisableLowerFunctionLDS) {
+    addPass(createAMDGPULowerFunctionLDSPass());
+  }
+
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
     addPass(createAMDGPUPromoteAlloca());
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -72,6 +72,7 @@
   AMDGPUMachineModuleInfo.cpp
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
+  AMDGPULowerFunctionLDSPass.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPostLegalizerCombiner.cpp
   AMDGPUPreLegalizerCombiner.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-function-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-function-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-function-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-function-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-function-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-function-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-function-lds-inactive.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-function-lds-inactive.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-function-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-function-lds < %s | FileCheck %s
+
+; Variables that are not lowered by this pass are left unchanged
+; CHECK-NOT: asm
+; CHECK-NOT: llvm.amdgcn.function.lds
+; CHECK-NOT: llvm.amdgcn.function.lds.t
+
+; var1, var2 would be transformed were they used from a non-kernel function
+@var1 = addrspace(3) global i32 undef
+@var2 = addrspace(3) global float undef
+
+; constant variables are left to the optimizer / error diagnostics
+@const_undef = addrspace(3) constant i32 undef
+@const_with_init = addrspace(3) constant i64 8
+
+; Use of an addrspace(3) variable with an initializer is skipped,
+; so as to preserve the unimplemented error from llc
+@with_init = addrspace(3) global i64 0
+
+; External and constant are both left to the optimizer / error diagnostics
+@extern = external addrspace(3) global i32
+
+; Only local addrspace variables are transformed
+@addr4 = addrspace(4) global i64 undef
+
+; Use by .used lists doesn't trigger lowering
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; Access from a function would cause lowering for non-excluded cases
+define void @use_variables() {
+  %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
+  %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
+  %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+  %v2 = atomicrmw add i32 addrspace(3)* @extern, i32 %c0 monotonic
+  %v1 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+  ret void
+}
+
+; Use by kernel doesn't trigger lowering
+define amdgpu_kernel void @kern_use() {
+  %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+  call void @use_variables()
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/lower-function-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-function-lds-used-list.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-function-lds-used-list.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-function-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-function-lds < %s | FileCheck %s
+
+; Check new struct is added to compiler.used and that the replaced variable is removed
+
+; CHECK: %llvm.amdgcn.function.lds.t = type { float }
+; CHECK: @ignored = addrspace(1) global i64 0
+; CHECK: @llvm.amdgcn.function.lds = internal addrspace(3) global %llvm.amdgcn.function.lds.t undef, align 8
+
+; CHECK-NOT: @tolower
+
+@tolower = addrspace(3) global float undef, align 8
+
+; A variable that is unchanged by pass
+@ignored = addrspace(1) global i64 0
+
+; Start with one value to replace and one to ignore in each used list
+
+; @ignored still in list, @tolower removed
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; @ignored still in list, @tolower removed, llvm.amdgcn.function.lds appended
+; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.function.lds.t addrspace(3)* @llvm.amdgcn.function.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
+  %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-function-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-function-lds.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-function-lds.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-function-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-function-lds < %s | FileCheck %s
+
+; Padding to meet alignment, so references to @var1 replaced with gep ptr, 0, 2
+; No i64 as addrspace(3) types with initializers are ignored. Likewise no addrspace(4).
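+; Illustrative layout (derived from the sort/padding logic in the pass, for
+; this input): @var0 (float, align 8) is placed at offset 0; @var1 (i32,
+; align 8) must land at offset 8, so 4 bytes of padding are inserted between.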
+; CHECK: %llvm.amdgcn.function.lds.t = type { float, [4 x i8], i32 }
+
+; Variables removed by pass
+; CHECK-NOT: @var0
+; CHECK-NOT: @var1
+
+@var0 = addrspace(3) global float undef, align 8
+@var1 = addrspace(3) global i32 undef, align 8
+
+; A variable that is unchanged by pass
+; CHECK: @with_init = addrspace(3) global i64 0
+@with_init = addrspace(3) global i64 0
+
+; Instance of new type, aligned to max of element alignment
+; CHECK: @llvm.amdgcn.function.lds = internal addrspace(3) global %llvm.amdgcn.function.lds.t undef, align 8
+
+; Use in func rewritten to access struct at address zero, which prints as null
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.function.lds.t, %llvm.amdgcn.function.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %val1 = add i32 %val0, 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.function.lds.t, %llvm.amdgcn.function.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
+  %val0 = load i32, i32 addrspace(3)* @var1, align 4
+  %val1 = add i32 %val0, 4
+  store i32 %val1, i32 addrspace(3)* @var1, align 4
+  %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+  ret void
+}
+
+; This kernel calls a function that uses LDS so needs the block
+; CHECK-LABEL: @kern_call()
+; CHECK: call void asm sideeffect "// Alloc function lds block", "s"(%llvm.amdgcn.function.lds.t addrspace(3)* @llvm.amdgcn.function.lds)
+; CHECK: call void @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
+define amdgpu_kernel void @kern_call() {
+  call void @func()
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
+  ret void
+}
+
+; This kernel makes no calls so does not need the LDS block, but the pass
+; conservatively marks every kernel
+; CHECK-LABEL: @kern_empty()
+; CHECK: call void asm sideeffect "// Alloc function lds block", "s"(%llvm.amdgcn.function.lds.t addrspace(3)* @llvm.amdgcn.function.lds)
+define spir_kernel void @kern_empty() {
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
-; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s
+; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-function-lds=true < %s | FileCheck -check-prefix=ASM %s
 
 target datalayout = "A5"