diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -71,6 +71,7 @@
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+ModulePass *createAMDGPULowerModuleLDSPass();
 FunctionPass *createSIModeRegisterPass();
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
@@ -145,6 +146,13 @@
   TargetMachine &TM;
 };
 
+void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
+extern char &AMDGPULowerModuleLDSID;
+
+struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -500,9 +500,10 @@
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
-  const DataLayout &DL = F.getParent()->getDataLayout();
+  Info->allocateModuleLDSGlobal(F.getParent());
+
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -591,6 +592,7 @@
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
+  Info->allocateModuleLDSGlobal(F.getParent());
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -0,0 +1,380 @@
+//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates LDS uses from non-kernel functions.
+//
+// The strategy is to create a new struct with a field for each LDS variable
+// and allocate that struct at the same address for every kernel. Uses of the
+// original LDS variables are then replaced with compile-time offsets from that
+// known address. AMDGPUMachineFunction allocates the LDS global.
+//
+// Local variables with constant annotation or non-undef initializer are passed
+// through unchanged for simplification or error diagnostics in later passes.
+//
+// To reduce the memory overhead, variables that are only used by kernels are
+// excluded from this transform. The analysis to determine whether a variable
+// is only used by a kernel is cheap and conservative, so this may allocate
+// a variable in every kernel when it was not strictly necessary to do so.
+//
+// A possible future refinement is to specialise the structure per-kernel, so
+// that fields can be elided based on more expensive analysis.
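+//
+// Illustrative example (the tests in this patch show the exact output): two
+// float variables used from non-kernel functions become
+//   %llvm.amdgcn.module.lds.t = type { float, float }
+// placed at LDS address zero, and each use of the original variables is
+// rewritten to a constantexpr gep into that struct.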
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-lower-module-lds"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerModuleLDS : public ModulePass {
+
+  static bool isKernelCC(Function *Func) {
+    return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+  }
+
+  static Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+    return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                         GV->getValueType());
+  }
+
+  static bool
+  userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                       User *InitialUser) {
+    // Any LDS variable can be lowered by moving into the created struct.
+    // Each variable so lowered is allocated in every kernel, so variables
+    // whose users are all known to be safe to lower without the transform
+    // are left unchanged.
+    SmallPtrSet<User *, 8> Visited;
+    SmallVector<User *, 16> Stack;
+    Stack.push_back(InitialUser);
+
+    while (!Stack.empty()) {
+      User *V = Stack.pop_back_val();
+      Visited.insert(V);
+
+      if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
+        if (UsedList.contains(G)) {
+          continue;
+        }
+      }
+
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        if (isKernelCC(I->getFunction())) {
+          continue;
+        }
+      }
+
+      if (auto *E = dyn_cast<ConstantExpr>(V)) {
+        for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
+             ++EU) {
+          if (Visited.insert(*EU).second) {
+            Stack.push_back(*EU);
+          }
+        }
+        continue;
+      }
+
+      // Unknown user, conservatively lower the variable.
+      return true;
+    }
+
+    return false;
+  }
+
+  static std::vector<GlobalVariable *>
+  findVariablesToLower(Module &M,
+                       const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+    std::vector<GlobalVariable *> LocalVars;
+    for (auto &GV : M.globals()) {
+      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+        continue;
+      }
+      if (!GV.hasInitializer()) {
+        // addrspace(3) without initializer implies cuda/hip extern __shared__.
+        // The semantics for such a variable appear to be that all extern
+        // __shared__ variables alias one another, in which case this transform
+        // is not required.
+        continue;
+      }
+      if (!isa<UndefValue>(GV.getInitializer())) {
+        // Initializers are unimplemented for local address space.
+        // Leave such variables in place for consistent error reporting.
+        continue;
+      }
+      if (GV.isConstant()) {
+        // A constant undef variable can't be written to, and any load is
+        // undef, so it should be eliminated by the optimizer. It could be
+        // dropped by the back end if not. This pass skips over it.
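+        // (For example, @const_undef = addrspace(3) constant i32 undef from
+        // the lower-module-lds-inactive.ll test in this patch is left in
+        // place.)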
+ continue; + } + if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) { + return userRequiresLowering(UsedList, U); + })) { + continue; + } + LocalVars.push_back(&GV); + } + return LocalVars; + } + + static void removeFromUsedList(Module &M, StringRef Name, + SmallPtrSetImpl &ToRemove) { + GlobalVariable *GV = M.getGlobalVariable(Name); + if (!GV || ToRemove.empty()) { + return; + } + + SmallVector Init; + auto *CA = cast(GV->getInitializer()); + for (auto &Op : CA->operands()) { + // ModuleUtils::appendToUsed only inserts Constants + Constant *C = cast(Op); + if (!ToRemove.contains(C->stripPointerCasts())) { + Init.push_back(C); + } + } + + if (Init.size() == CA->getNumOperands()) { + return; // none to remove + } + + GV->eraseFromParent(); + + if (!Init.empty()) { + ArrayType *ATy = + ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size()); + GV = + new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, Init), Name); + GV->setSection("llvm.metadata"); + } + } + + static void + removeFromUsedLists(Module &M, + const std::vector &LocalVars) { + SmallPtrSet LocalVarsSet; + for (size_t I = 0; I < LocalVars.size(); I++) { + if (Constant *C = dyn_cast(LocalVars[I]->stripPointerCasts())) { + LocalVarsSet.insert(C); + } + } + removeFromUsedList(M, "llvm.used", LocalVarsSet); + removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); + } + + static void markUsedByKernel(IRBuilder<> &Builder, Function *Func, + GlobalVariable *SGV) { + // The llvm.amdgcn.module.lds instance is implicitly used by all kernels + // that might call a function which accesses a field within it. This is + // presently approximated to 'all kernels' if there are any such functions + // in the module. This implicit use is reified as an explicit use here so + // that later passes, specifically PromoteAlloca, account for the required + // memory without any knowledge of this transform. + + // An operand bundle on llvm.donothing works because the call instruction + // survives until after the last pass that needs to account for LDS. It is + // better than inline asm as the latter survives until the end of codegen. A + // totally robust solution would be a function with the same semantics as + // llvm.donothing that takes a pointer to the instance and is lowered to a + // no-op after LDS is allocated, but that is not presently necessary. 
+
+    LLVMContext &Ctx = Func->getContext();
+
+    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
+
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
+
+    Function *Decl =
+        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
+
+    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
+        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
+
+    Builder.CreateCall(FTy, Decl, {},
+                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
+                       "");
+  }
+
+  static SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+    SmallPtrSet<GlobalValue *, 32> UsedList;
+
+    SmallVector<GlobalValue *, 32> TmpVec;
+    collectUsedGlobalVariables(M, TmpVec, true);
+    UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+    TmpVec.clear();
+    collectUsedGlobalVariables(M, TmpVec, false);
+    UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+    return UsedList;
+  }
+
+public:
+  static char ID;
+
+  AMDGPULowerModuleLDS() : ModulePass(ID) {
+    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override {
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
+    SmallPtrSet<GlobalValue *, 32> UsedList = getUsedList(M);
+
+    // Find variables to move into new struct instance
+    std::vector<GlobalVariable *> FoundLocalVars =
+        findVariablesToLower(M, UsedList);
+
+    if (FoundLocalVars.empty()) {
+      // No variables to rewrite, no changes made.
+      return false;
+    }
+
+    // Sort by alignment, descending, to minimise padding.
+    // On ties, sort by size, descending, then by name, lexicographical.
+    llvm::stable_sort(
+        FoundLocalVars,
+        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+          Align ALHS = getAlign(DL, LHS);
+          Align ARHS = getAlign(DL, RHS);
+          if (ALHS != ARHS) {
+            return ALHS > ARHS;
+          }
+
+          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+          if (SLHS != SRHS) {
+            return SLHS > SRHS;
+          }
+
+          // By variable name on tie for predictable order in test cases.
+          return LHS->getName() < RHS->getName();
+        });
+
+    std::vector<GlobalVariable *> LocalVars;
+    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+    {
+      // This usually won't need to insert any padding; the allocation could
+      // perhaps be avoided in that case.
+      uint64_t CurrentOffset = 0;
+      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
+        GlobalVariable *FGV = FoundLocalVars[I];
+        Align DataAlign = getAlign(DL, FGV);
+
+        uint64_t DataAlignV = DataAlign.value();
+        if (uint64_t Rem = CurrentOffset % DataAlignV) {
+          uint64_t Padding = DataAlignV - Rem;
+
+          // Append an array of padding bytes to meet the requested alignment.
+          // Note (o + (a - (o % a)) ) % a == 0
+          //      (offset + Padding  ) % align == 0
+
+          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+          LocalVars.push_back(new GlobalVariable(
+              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+              false));
+          CurrentOffset += Padding;
+        }
+
+        LocalVars.push_back(FGV);
+        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+      }
+    }
+
+    std::vector<Type *> LocalVarTypes;
+    LocalVarTypes.reserve(LocalVars.size());
+    std::transform(
+        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+    StructType *LDSTy = StructType::create(
+        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
+
+    Align MaxAlign = getAlign(DL, LocalVars[0]); // was sorted on alignment
+    Constant *InstanceAddress = Constant::getIntegerValue(
+        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+
+    GlobalVariable *SGV = new GlobalVariable(
+        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
+        AMDGPUAS::LOCAL_ADDRESS, false);
+    SGV->setAlignment(MaxAlign);
+    appendToCompilerUsed(
+        M, {static_cast<Constant *>(
+               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    // The verifier rejects used lists containing an inttoptr of a constant,
+    // so remove the variables from these lists before replaceAllUsesWith.
+    removeFromUsedLists(M, LocalVars);
+
+    // Replace uses of the ith variable with a constantexpr to the ith field of
+    // the instance that will be allocated by AMDGPUMachineFunction.
+    Type *I32 = Type::getInt32Ty(Ctx);
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      GV->replaceAllUsesWith(
+          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+      GV->eraseFromParent();
+    }
+
+    // Mark kernels with a call that reads the address of the allocated
+    // structure. This is not necessary for lowering. It lets other passes,
+    // specifically PromoteAlloca, accurately calculate how much LDS will be
+    // used by the kernel after lowering.
+    {
+      IRBuilder<> Builder(Ctx);
+      SmallPtrSet<Function *, 32> Kernels;
+      for (auto &I : M.functions()) {
+        Function *Func = &I;
+        if (isKernelCC(Func) && !Kernels.contains(Func)) {
+          markUsedByKernel(Builder, Func, SGV);
+          Kernels.insert(Func);
+        }
+      }
+    }
+    return true;
+  }
+};
+
+} // namespace
+char AMDGPULowerModuleLDS::ID = 0;
+
+char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
+
+INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
+                "Lower uses of LDS variables from non-kernel functions", false,
+                false)
+
+ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
+  return new AMDGPULowerModuleLDS();
+}
+
+PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
+                                                ModuleAnalysisManager &) {
+  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
+                                               : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -94,6 +94,7 @@
   }
 
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+  void allocateModuleLDSGlobal(const Module *M);
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -64,6 +64,18 @@
   return Offset;
 }
 
+void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
+  if (isModuleEntryFunction()) {
+    GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.module.lds");
+    if (GV) {
+      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+      (void)Offset;
+      assert(Offset == 0 &&
+             "Module LDS expected to be allocated before other LDS");
+    }
+  }
+}
+
 void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
                                            const GlobalVariable &GV) {
   assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -126,8 +126,13 @@
 char AMDGPUPromoteAlloca::ID = 0;
 char AMDGPUPromoteAllocaToVector::ID = 0;
 
-INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
-                "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                      "AMDGPU promote alloca to vector or LDS", false, false)
+// Move LDS uses from functions to kernels before promote alloca for accurate
+// estimation of LDS available
+INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
+INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                    "AMDGPU promote alloca to vector or LDS", false, false)
 
 INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                 "AMDGPU promote alloca to vector", false, false)
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -193,6 +193,11 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+    DisableLowerModuleLDS("amdgpu-disable-lower-module-lds", cl::Hidden,
+                          cl::desc("Disable lower module lds pass"),
+                          cl::init(false));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -235,6 +240,7 @@
   initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
+  initializeAMDGPULowerModuleLDSPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
@@ -506,6 +512,10 @@
       PM.addPass(AMDGPUAlwaysInlinePass());
       return true;
     }
+    if (PassName == "amdgpu-lower-module-lds") {
+      PM.addPass(AMDGPULowerModuleLDSPass());
+      return true;
+    }
     return false;
   });
 PB.registerPipelineParsingCallback(
@@ -535,7 +545,6 @@
       PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
       return true;
     }
-
     return false;
   });
@@ -884,6 +893,10 @@
   // Replace OpenCL enqueued block function pointers with global variables.
   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
 
+  // This can increase LDS used by a kernel, so it runs before PromoteAlloca.
+  if (!DisableLowerModuleLDS)
+    addPass(createAMDGPULowerModuleLDSPass());
+
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
     addPass(createAMDGPUPromoteAlloca());
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -67,6 +67,7 @@
   AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
+  AMDGPULowerModuleLDSPass.cpp
   AMDGPUMachineCFGStructurizer.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUMachineModuleInfo.cpp
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2263,6 +2263,8 @@
     return DAG.getEntryNode();
   }
 
+  Info->allocateModuleLDSGlobal(Fn.getParent());
+
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
   BitVector Skipped(Ins.size());
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
 
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-module-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*)
 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, float }
+
+@func = addrspace(3) global float undef, align 4
+
+; @kern is only used from a kernel so it is left unchanged
+; CHECK: @kern = addrspace(3) global float undef, align 4
+@kern = addrspace(3) global float undef, align 4
+
+; @func is only used from a non-kernel function so is rewritten
+; CHECK-NOT: @func
+; @both is used from a non-kernel function so is rewritten
+; CHECK-NOT: @both
+; sorted both < func, so @both at null and @func at 4
+@both = addrspace(3) global float undef, align 4
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
+
+; CHECK-LABEL: @get_func()
+; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+define i32 @get_func() local_unnamed_addr #0 {
+entry:
+  %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret i32 %0
+}
+
+; CHECK-LABEL: @set_func(i32 %x)
+; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+define void @set_func(i32 %x) local_unnamed_addr #1 {
+entry:
+  store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
+
+; CHECK-LABEL: @timestwo()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %mul = mul i32 %ld, 2
+; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+define amdgpu_kernel void @timestwo() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  %mul = mul i32 %ld, 2
+  store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -0,0 +1,68 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Variables that are not lowered by this pass are left unchanged
+; CHECK-NOT: asm
+; CHECK-NOT: llvm.amdgcn.module.lds
+; CHECK-NOT: llvm.amdgcn.module.lds.t
+
+; var1, var2 would be transformed were they used from a non-kernel function
+; CHECK: @var1 = addrspace(3) global i32 undef
+; CHECK: @var2 = addrspace(3) global float undef
+@var1 = addrspace(3) global i32 undef
+@var2 = addrspace(3) global float undef
+
+; constant variables are left to the optimizer / error diagnostics
+; CHECK: @const_undef = addrspace(3) constant i32 undef
+; CHECK: @const_with_init = addrspace(3) constant i64 8
+@const_undef = addrspace(3) constant i32 undef
+@const_with_init = addrspace(3) constant i64 8
+
+; External and constant are both left to the optimizer / error diagnostics
+; CHECK: @extern = external addrspace(3) global i32
+@extern = external addrspace(3) global i32
+
+; Use of an addrspace(3) variable with an initializer is skipped,
+; so as to preserve the unimplemented error from llc
+; CHECK: @with_init = addrspace(3) global i64 0
+@with_init = addrspace(3) global i64 0
+
+; Only local addrspace variables are transformed
+; CHECK: @addr4 = addrspace(4) global i64 undef
+@addr4 = addrspace(4) global i64 undef
+
+; Assign to self is treated as any other initializer, i.e. ignored by this pass
+; CHECK: @toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
+@toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
+
+; Use by .used lists doesn't trigger lowering
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; Access from a function would cause lowering for non-excluded cases
+; CHECK-LABEL: @use_variables()
+; CHECK: %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
+; CHECK: %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
+; CHECK: %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
+; CHECK: %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
+; CHECK: %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+define void @use_variables() {
+  %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
+  %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
+  %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
+  %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
+  %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+  ret void
+}
+
+; Use by kernel doesn't trigger lowering
+; CHECK-LABEL: @kern_use()
+; CHECK: %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+define amdgpu_kernel void @kern_use() {
+  %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+  call void @use_variables()
+  ret void
+}
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }
+
+; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
+
+; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+@function_target = addrspace(3) global float undef, align 4
+@function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* @function_target to float*), align 8
+
+@kernel_target = addrspace(3) global double undef, align 8
+@kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* @kernel_target to double*), align 8
+
+; CHECK-LABEL: @function(float %x)
+; CHECK: %0 = load float*, float* addrspace(1)* @function_indirect, align 8
+define void @function(float %x) local_unnamed_addr #5 {
+entry:
+  %0 = load float*, float* addrspace(1)* @function_indirect, align 8
+  store float %x, float* %0, align 4
+  ret void
+}
+
+; CHECK-LABEL: @kernel(double %x)
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
+define amdgpu_kernel void @kernel(double %x) local_unnamed_addr #5 {
+entry:
+  %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
+  store double %x, double* %0, align 8
+  ret void
+}
+
+
+
+
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Check the new struct is added to llvm.compiler.used and that the replaced
+; variable is removed
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { float }
+; CHECK: @ignored = addrspace(1) global i64 0
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+; CHECK-NOT: @tolower
+
+@tolower = addrspace(3) global float undef, align 8
+
+; A variable that is unchanged by the pass
+@ignored = addrspace(1) global i64 0
+
+; Start with one value to replace and one to ignore in the .used lists
+
+; @ignored still in list, @tolower removed
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; @ignored still in list, @tolower removed, llvm.amdgcn.module.lds appended
+; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
+  %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
+  ret void
+}
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Padding is needed to meet alignment, so references to @var1 are replaced with gep ptr, 0, 2
+; No i64 field, as addrspace(3) variables with initializers are ignored. Likewise no addrspace(4) field.
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
+
+; Variables removed by the pass
+; CHECK-NOT: @var0
+; CHECK-NOT: @var1
+
+@var0 = addrspace(3) global float undef, align 8
+@var1 = addrspace(3) global i32 undef, align 8
+
+@ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4
+
+; A variable that is unchanged by the pass
+; CHECK: @with_init = addrspace(3) global i64 0
+@with_init = addrspace(3) global i64 0
+
+; Instance of the new type, aligned to the max of the element alignments
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+; Use in func rewritten to access the struct at address zero, which prints as null
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %val1 = add i32 %val0, 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
+  %val0 = load i32, i32 addrspace(3)* @var1, align 4
+  %val1 = add i32 %val0, 4
+  store i32 %val1, i32 addrspace(3)* @var1, align 4
+  %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+  ret void
+}
+
+; This kernel calls a function that uses LDS so needs the block
+; CHECK-LABEL: @kern_call()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: call void @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
+define amdgpu_kernel void @kern_call() {
+  call void @func()
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
+  ret void
+}
+
+; This kernel makes no calls, so it does not strictly need the LDS block; the
+; pass conservatively marks every kernel in the module
+; CHECK-LABEL: @kern_empty()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+define spir_kernel void @kern_empty() {
+  ret void
+}
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
-; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s
+; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true < %s | FileCheck -check-prefix=ASM %s
 
 target datalayout = "A5"
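
Reading aid (not part of the patch): a minimal before/after sketch of the transform, distilled from the lower-module-lds.ll test above. The variable and function names are hypothetical.

; Before: a non-kernel function uses two LDS variables.
@a = addrspace(3) global float undef, align 8
@b = addrspace(3) global i32 undef, align 8

define void @f() {
  store float 0.0, float addrspace(3)* @a
  store i32 1, i32 addrspace(3)* @b
  ret void
}

define amdgpu_kernel void @k() {
  call void @f()
  ret void
}

; After: both variables live in one struct at LDS address zero. @a sorts first
; (equal alignment and size, "a" < "b"), the i32 field needs 4 padding bytes to
; stay 8-byte aligned, and the field-0 gep constant-folds to null.
%llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
@llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

define void @f() {
  store float 0.0, float addrspace(3)* null
  store i32 1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2)
  ret void
}

define amdgpu_kernel void @k() {
  call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
  call void @f()
  ret void
}

declare void @llvm.donothing()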