diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -127,7 +127,7 @@ unsigned AS = GV.getAddressSpace(); if ((AS == AMDGPUAS::REGION_ADDRESS) || (AS == AMDGPUAS::LOCAL_ADDRESS && - (!AMDGPUTargetMachine::EnableLowerModuleLDS || !GV.hasInitializer()))) + (!AMDGPUTargetMachine::EnableLowerModuleLDS))) recursivelyVisitUsers(GV, FuncsToAlwaysInline); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2542,7 +2542,7 @@ // allocated ones. They all share the same offset. if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { // Adjust alignment for that dynamic shared memory array. - MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); + MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); LLT S32 = LLT::scalar(32); auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -184,6 +184,18 @@ return AMDGPU::isKernel(F->getCallingConv()); } +bool isDynamicLDS(const GlobalVariable *GV) { + if (AMDGPU::isLDSVariableToLower(*GV)) { + if (!GV->hasInitializer()) { + // addrspace(3) without initializer implies cuda/hip extern __shared__ + // the semantics for such a variable appears to be that all extern + // __shared__ variables alias one another + return true; + } + } + return false; +} + class AMDGPULowerModuleLDS : public ModulePass { static void @@ -468,6 +480,44 @@ AMDGPUAS::CONSTANT_ADDRESS); } + void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder, + GlobalVariable *LookupTable, + GlobalVariable *GV, Use &U, + Value *OptionalIndex) { + // Table is a constant array of the same length as OrderedKernels + // It either contains 32 bit pointers or structs of 32 bit pointers, where + // the OptionalIndex argument is used to index into the struct if present. + LLVMContext &Ctx = M.getContext(); + Type *I32 = Type::getInt32Ty(Ctx); + auto *I = cast<Instruction>(U.getUser()); + + Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction()); + + if (auto *Phi = dyn_cast<PHINode>(I)) { + BasicBlock *BB = Phi->getIncomingBlock(U); + Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); + } else { + Builder.SetInsertPoint(I); + } + + SmallVector<Value *, 3> GEPIdx = { + ConstantInt::get(I32, 0), + tableKernelIndex, + }; + if (OptionalIndex) + GEPIdx.push_back(OptionalIndex); + + Value *Address = Builder.CreateInBoundsGEP( + LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName()); + + Value *loaded = Builder.CreateLoad(I32, Address); + + Value *replacement = + Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName()); + + U.set(replacement); + } + void replaceUsesInInstructionsWithTableLookup( Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables, GlobalVariable *LookupTable) { @@ -484,33 +534,8 @@ if (!I) continue; - Value *tableKernelIndex = - getTableLookupKernelIndex(M, I->getFunction()); - - // So if the phi uses this value multiple times, what does this look - // like?
- if (auto *Phi = dyn_cast<PHINode>(I)) { - BasicBlock *BB = Phi->getIncomingBlock(U); - Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); - } else { - Builder.SetInsertPoint(I); - } - - Value *GEPIdx[3] = { - ConstantInt::get(I32, 0), - tableKernelIndex, - ConstantInt::get(I32, Index), - }; - - Value *Address = Builder.CreateInBoundsGEP( - LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName()); - - Value *loaded = Builder.CreateLoad(I32, Address); - - Value *replacement = - Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName()); - - U.set(replacement); + replaceUseWithTableLookup(M, Builder, LookupTable, GV, U, + ConstantInt::get(I32, Index)); } } } @@ -632,47 +657,52 @@ } static std::vector<Function *> assignLDSKernelIDToEachKernel( - Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS) { + Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS, + DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) { // Associate kernels in the set with an arbitrary but reproducible order and // annotate them with that order in metadata. This metadata is recognised by // the backend and lowered to a SGPR which can be read from using // amdgcn_lds_kernel_id. std::vector<Function *> OrderedKernels; + if (!KernelsThatAllocateTableLDS.empty() || + !KernelsThatIndirectlyAllocateDynamicLDS.empty()) { - for (Function &Func : M->functions()) { - if (Func.isDeclaration()) - continue; - if (!isKernelLDS(&Func)) - continue; + for (Function &Func : M->functions()) { + if (Func.isDeclaration()) + continue; + if (!isKernelLDS(&Func)) + continue; - if (KernelsThatAllocateTableLDS.contains(&Func)) { - assert(Func.hasName()); // else fatal error earlier - OrderedKernels.push_back(&Func); + if (KernelsThatAllocateTableLDS.contains(&Func) || + KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) { + assert(Func.hasName()); // else fatal error earlier + OrderedKernels.push_back(&Func); + } } - } - // Put them in an arbitrary but reproducible order - llvm::sort(OrderedKernels.begin(), OrderedKernels.end(), - [](const Function *lhs, const Function *rhs) -> bool { - return lhs->getName() < rhs->getName(); - }); + // Put them in an arbitrary but reproducible order + llvm::sort(OrderedKernels.begin(), OrderedKernels.end(), + [](const Function *lhs, const Function *rhs) -> bool { + return lhs->getName() < rhs->getName(); + }); - // Annotate the kernels with their order in this vector - LLVMContext &Ctx = M->getContext(); - IRBuilder<> Builder(Ctx); + // Annotate the kernels with their order in this vector + LLVMContext &Ctx = M->getContext(); + IRBuilder<> Builder(Ctx); - if (OrderedKernels.size() > UINT32_MAX) { - // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU - report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels"); - } + if (OrderedKernels.size() > UINT32_MAX) { + // 32 bit keeps it in one SGPR.
> 2**32 kernels won't fit on the GPU + report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels"); + } - for (size_t i = 0; i < OrderedKernels.size(); i++) { - Metadata *AttrMDArgs[1] = { - ConstantAsMetadata::get(Builder.getInt32(i)), - }; - OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", - MDNode::get(Ctx, AttrMDArgs)); + for (size_t i = 0; i < OrderedKernels.size(); i++) { + Metadata *AttrMDArgs[1] = { + ConstantAsMetadata::get(Builder.getInt32(i)), + }; + OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", + MDNode::get(Ctx, AttrMDArgs)); + } } return OrderedKernels; @@ -683,7 +713,8 @@ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly, DenseSet<GlobalVariable *> &ModuleScopeVariables, DenseSet<GlobalVariable *> &TableLookupVariables, - DenseSet<GlobalVariable *> &KernelAccessVariables) { + DenseSet<GlobalVariable *> &KernelAccessVariables, + DenseSet<GlobalVariable *> &DynamicVariables) { GlobalVariable *HybridModuleRoot = LoweringKindLoc != LoweringKind::hybrid @@ -705,6 +736,11 @@ assert(AMDGPU::isLDSVariableToLower(*GV)); assert(K.second.size() != 0); + if (isDynamicLDS(GV)) { + DynamicVariables.insert(GV); + continue; + } + switch (LoweringKindLoc) { case LoweringKind::module: ModuleScopeVariables.insert(GV); @@ -743,7 +779,7 @@ // All LDS variables accessed indirectly have now been partitioned into // the distinct lowering strategies. assert(ModuleScopeVariables.size() + TableLookupVariables.size() + - KernelAccessVariables.size() == + KernelAccessVariables.size() + DynamicVariables.size() == LDSToKernelsThatNeedToAccessItIndirectly.size()); } @@ -813,7 +849,7 @@ markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV); } else { - Func.addFnAttr("amdgpu-elide-module-lds"); + markElideModuleLDS(Func); } } @@ -838,13 +874,17 @@ // Allocating variables that are used directly in this struct to get // alignment aware allocation and predictable frame size. for (auto &v : LDSUsesInfo.direct_access[&Func]) { - KernelUsedVariables.insert(v); + if (!isDynamicLDS(v)) { + KernelUsedVariables.insert(v); + } } // Allocating variables that are accessed indirectly so that a lookup of // this struct instance can find them from nested functions. for (auto &v : LDSUsesInfo.indirect_access[&Func]) { - KernelUsedVariables.insert(v); + if (!isDynamicLDS(v)) { + KernelUsedVariables.insert(v); + } } // Variables allocated in module lds must all resolve to that struct, @@ -857,6 +897,7 @@ if (KernelUsedVariables.empty()) { // Either used no LDS, or the LDS it used was all in the module struct + // or dynamically sized continue; } @@ -876,33 +917,6 @@ auto Replacement = createLDSVariableReplacement(M, VarName, KernelUsedVariables); - // This struct is allocated at a predictable address that can be - // calculated now, recorded in metadata then used to lower references to - // it during codegen.
- { - // frame layout, starting from 0 - //{ - // module.lds - // alignment padding - // kernel instance - //} - - if (!MaybeModuleScopeStruct || - Func.hasFnAttribute("amdgpu-elide-module-lds")) { - // There's no module.lds for this kernel so this replacement struct - // goes first - recordLDSAbsoluteAddress(&M, Replacement.SGV, 0); - } else { - const DataLayout &DL = M.getDataLayout(); - TypeSize ModuleSize = - DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType()); - GlobalVariable *KernelStruct = Replacement.SGV; - Align KernelAlign = AMDGPU::getAlign(DL, KernelStruct); - recordLDSAbsoluteAddress(&M, Replacement.SGV, - alignTo(ModuleSize, KernelAlign)); - } - } - // remove preserves existing codegen removeLocalVarsFromUsedLists(M, KernelUsedVariables); KernelToReplacement[&Func] = Replacement; @@ -917,6 +931,124 @@ return KernelToReplacement; } + static GlobalVariable * + buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo, + Function *func) { + // Create a dynamic lds variable with a name associated with the passed + // function that has the maximum alignment of any dynamic lds variable + // reachable from this kernel. Dynamic LDS is allocated after the static LDS + // allocation, possibly after alignment padding. The representative variable + // created here has the maximum alignment of any other dynamic variable + // reachable by that kernel. All dynamic LDS variables are allocated at the + // same address in each kernel in order to provide the documented aliasing + // semantics. Setting the alignment here allows this IR pass to accurately + // predict the exact constant at which it will be allocated. + + assert(isKernelLDS(func)); + + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + Align MaxDynamicAlignment(1); + + auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) { + if (isDynamicLDS(GV)) { + MaxDynamicAlignment = + std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV)); + } + }; + + for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) { + UpdateMaxAlignment(GV); + } + + for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) { + UpdateMaxAlignment(GV); + } + + assert(func->hasName()); // Checked by caller + std::string VarName = + Twine("llvm.amdgcn." 
+ func->getName() + ".dynlds").str(); + + auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0); + GlobalVariable *N = new GlobalVariable( + M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr, + VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, + false); + N->setAlignment(MaxDynamicAlignment); + + assert(isDynamicLDS(N)); + return N; + } + + DenseMap lowerDynamicLDSVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + DenseSet const &KernelsThatIndirectlyAllocateDynamicLDS, + DenseSet const &DynamicVariables, + std::vector const &OrderedKernels) { + DenseMap KernelToCreatedDynamicLDS; + if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + Type *I32 = Type::getInt32Ty(Ctx); + + std::vector newDynamicLDS; + + // Table is built in the same order as OrderedKernels + for (auto &func : OrderedKernels) { + + if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) { + assert(isKernelLDS(func)); + if (!func->hasName()) { + report_fatal_error("Anonymous kernels cannot use LDS variables"); + } + + GlobalVariable *N = + buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func); + + KernelToCreatedDynamicLDS[func] = N; + + markUsedByKernel(Builder, func, N); + + auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0); + auto GEP = ConstantExpr::getGetElementPtr( + emptyCharArray, N, ConstantInt::get(I32, 0), true); + newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32)); + } else { + newDynamicLDS.push_back(PoisonValue::get(I32)); + } + } + assert(OrderedKernels.size() == newDynamicLDS.size()); + + ArrayType *t = ArrayType::get(I32, newDynamicLDS.size()); + Constant *init = ConstantArray::get(t, newDynamicLDS); + GlobalVariable *table = new GlobalVariable( + M, t, true, GlobalValue::InternalLinkage, init, + "llvm.amdgcn.dynlds.offset.table", nullptr, + GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); + + for (GlobalVariable *GV : DynamicVariables) { + for (Use &U : make_early_inc_range(GV->uses())) { + auto *I = dyn_cast(U.getUser()); + if (!I) + continue; + if (isKernelLDS(I->getFunction())) + continue; + + replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr); + } + } + } + return KernelToCreatedDynamicLDS; + } + + static bool canElideModuleLDS(const Function &F) { + return F.hasFnAttribute("amdgpu-elide-module-lds"); + } + + static void markElideModuleLDS(Function &F) { + F.addFnAttr("amdgpu-elide-module-lds"); + } + bool runOnModule(Module &M) override { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -939,12 +1071,15 @@ } } + // Partition variables accessed indirectly into the different strategies DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; DenseSet KernelAccessVariables; + DenseSet DynamicVariables; partitionVariablesIntoIndirectStrategies( M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly, - ModuleScopeVariables, TableLookupVariables, KernelAccessVariables); + ModuleScopeVariables, TableLookupVariables, KernelAccessVariables, + DynamicVariables); // If the kernel accesses a variable that is going to be stored in the // module instance through a call then that kernel needs to allocate the @@ -956,6 +1091,10 @@ kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, TableLookupVariables); + DenseSet KernelsThatIndirectlyAllocateDynamicLDS = + kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, + DynamicVariables); + GlobalVariable *MaybeModuleScopeStruct = 
lowerModuleScopeStructVariables( M, ModuleScopeVariables, KernelsThatAllocateModuleLDS); @@ -981,14 +1120,15 @@ }); } + // The ith element of this vector is kernel id i + std::vector<Function *> OrderedKernels = + assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS, + KernelsThatIndirectlyAllocateDynamicLDS); + if (!KernelsThatAllocateTableLDS.empty()) { LLVMContext &Ctx = M.getContext(); IRBuilder<> Builder(Ctx); - // The ith element of this vector is kernel id i - std::vector<Function *> OrderedKernels = - assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS); - for (size_t i = 0; i < OrderedKernels.size(); i++) { markUsedByKernel(Builder, OrderedKernels[i], KernelToReplacement[OrderedKernels[i]].SGV); @@ -1010,6 +1150,70 @@ LookupTable); } + DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = + lowerDynamicLDSVariables(M, LDSUsesInfo, + KernelsThatIndirectlyAllocateDynamicLDS, + DynamicVariables, OrderedKernels); + + // All kernel frames have been allocated. Calculate and record the + // addresses. + + { + const DataLayout &DL = M.getDataLayout(); + + for (Function &Func : M.functions()) { + if (Func.isDeclaration() || !isKernelLDS(&Func)) + continue; + + // All three of these are optional. The first variable is allocated at + // zero. They are allocated by allocateKnownAddressLDSGlobal in the + // following order: + //{ + // module.lds + // alignment padding + // kernel instance + // alignment padding + // dynamic lds variables + //} + + const bool AllocateModuleScopeStruct = + MaybeModuleScopeStruct && !canElideModuleLDS(Func); + + const bool AllocateKernelScopeStruct = + KernelToReplacement.contains(&Func); + + const bool AllocateDynamicVariable = + KernelToCreatedDynamicLDS.contains(&Func); + + uint32_t Offset = 0; + + if (AllocateModuleScopeStruct) { + // Allocated at zero, recorded once on construction, not once per + // kernel + Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType()); + } + + if (AllocateKernelScopeStruct) { + GlobalVariable *KernelStruct = KernelToReplacement[&Func].SGV; + + Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct)); + + recordLDSAbsoluteAddress(&M, KernelStruct, Offset); + + Offset += DL.getTypeAllocSize(KernelStruct->getValueType()); + + } + + if (AllocateDynamicVariable) { + GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func]; + + Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable)); + + recordLDSAbsoluteAddress(&M, DynamicVariable, Offset); + } + } + } + for (auto &GV : make_early_inc_range(M.globals())) if (AMDGPU::isLDSVariableToLower(GV)) { // probably want to remove from used lists diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -111,7 +111,7 @@ Align getDynLDSAlign() const { return DynLDSAlign; } - void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV); + void setDynLDSAlign(const Function &F, const GlobalVariable &GV); }; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -99,6 +99,15 @@ return M->getNamedGlobal(KernelLDSName); } +static const GlobalVariable * +getKernelDynLDSGlobalFromFunction(const Function &F) { + const Module *M = F.getParent(); + std::string KernelDynLDSName = "llvm.amdgcn."; + KernelDynLDSName += F.getName(); + KernelDynLDSName +=
".dynlds"; + return M->getNamedGlobal(KernelDynLDSName); +} + // This kernel calls no functions that require the module lds struct static bool canElideModuleLDS(const Function &F) { return F.hasFnAttribute("amdgpu-elide-module-lds"); @@ -131,11 +140,12 @@ const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName); const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F); + const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F); if (GV && !canElideModuleLDS(F)) { unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align()); std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*GV); - if (!Expect || (Offset != Expect)) { + if (!Expect || (Offset != *Expect)) { report_fatal_error("Inconsistent metadata on module LDS variable"); } } @@ -145,10 +155,23 @@ // before any other non-module LDS variables. unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align()); std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*KV); - if (!Expect || (Offset != Expect)) { + if (!Expect || (Offset != *Expect)) { report_fatal_error("Inconsistent metadata on kernel LDS variable"); } } + + if (Dyn) { + // The dynamic LDS is deterministic because the per-kernel one has the + // maximum alignment of any reachable LDS, and all remaining LDS variables, + // if present, are themselves dynamic LDS and will be allocated + // at the same address. + setDynLDSAlign(F, *Dyn); + unsigned Offset = LDSSize; + std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn); + if (!Expect || (Offset != *Expect)) { + report_fatal_error("Inconsistent metadata on dynamic LDS variable"); + } + } } } @@ -187,8 +210,10 @@ return {}; } -void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL, +void AMDGPUMachineFunction::setDynLDSAlign(const Function &F, const GlobalVariable &GV) { + const Module *M = F.getParent(); + const DataLayout &DL = M->getDataLayout(); assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); Align Alignment = @@ -198,4 +223,17 @@ LDSSize = alignTo(StaticLDSSize, Alignment); DynLDSAlign = Alignment; + + // If there is a dynamic LDS variable associated with this function F, every + // further dynamic LDS instance (allocated by calling setDynLDSAlign) must + // map to the same address. This holds because no LDS is allocated after the + // lowering pass if there are dynamic LDS variables present. + const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F); + if (Dyn) { + unsigned Offset = LDSSize; // return this? + std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn); + if (!Expect || (Offset != *Expect)) { + report_fatal_error("Inconsistent metadata on dynamic LDS variable"); + } + } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6139,7 +6139,8 @@ if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); // Adjust alignment for that dynamic shared memory array.
- MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV)); + Function &F = DAG.getMachineFunction().getFunction(); + MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -79,24 +79,23 @@ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { return false; } + if (GV.isConstant()) { + // A constant undef variable can't be written to, and any load is + // undef, so it should be eliminated by the optimizer. It could be + // dropped by the back end if not. This pass skips over it. + return false; + } if (!GV.hasInitializer()) { // addrspace(3) without initializer implies cuda/hip extern __shared__ // the semantics for such a variable appears to be that all extern - // __shared__ variables alias one another, in which case this transform - // is not required - return false; + // __shared__ variables alias one another. This hits different handling. + return true; } if (!isa<UndefValue>(GV.getInitializer())) { // Initializers are unimplemented for LDS address space. // Leave such variables in place for consistent error reporting. return false; } - if (GV.isConstant()) { - // A constant undef variable can't be written to, and any load is - // undef, so it should be eliminated by the optimizer. It could be - // dropped by the back end if not. This pass skips over it. - return false; - } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll --- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll +++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll @@ -135,25 +135,4 @@ ret void } -; CHECK-LABEL: dynamic_shared_array_with_call: -; CHECK-NOT: s_swappc_b64 -define amdgpu_kernel void @dynamic_shared_array_with_call(ptr addrspace(1) nocapture readnone %out) local_unnamed_addr { - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() - %1 = sext i32 %tid.x to i64 - %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i64 0, i64 %1 - %val0 = load float, ptr addrspace(3) %arrayidx0, align 4 - tail call void @store_value(float %val0) - ret void -} - -; CHECK-NOT: store_value -define linkonce_odr hidden void @store_value(float %val1) local_unnamed_addr { -entry: - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() - %0 = sext i32 %tid.x to i64 - %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i64 0, i64 %0 - store float %val1, ptr addrspace(3) %arrayidx1, align 4 - ret void -} - declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -34,7 +34,64 @@ @extern_normal = external addrspace(3) global [0 x float] @extern_overalign = external addrspace(3) global [0 x float], align 8 -; 2^3 cases encoded into function names + +; External LDS does not influence the frame when called indirectly either +define void @use_extern_normal() #0 { +; CHECK-LABEL: use_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s4, s15 +; CHECK-NEXT: s_ashr_i32 s5, s15, 31 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; CHECK-NEXT: s_add_u32 s4, s4, s6 +; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: ds_write_b32 v1, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 0 + store float 0x40091EB860000000, ptr addrspace(3) %arrayidx + ret void +} + +define void @use_extern_overalign() #0 { +; CHECK-LABEL: use_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s4, s15 +; CHECK-NEXT: s_ashr_i32 s5, s15, 31 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; CHECK-NEXT: s_add_u32 s4, s4, s6 +; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: ds_write_b32 v1, v0 offset:4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 1 + store float 4.200000e+01, ptr addrspace(3) %arrayidx + ret void +} + + +; First 2^3 of 2^4 cases encoded into function names +; no use of extern variable from nested function +; module_variable used/not-used +; kernel variable normal/overaligned +; extern variable normal/overaligned define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 { ; CHECK-LABEL: module_0_kernel_normal_extern_normal: @@ -268,5 +325,281 @@ ret void } + +;; Second 2^3 of 2^4 cases encoded into function names +; with extern variable from nested function +; module_variable used/not-used +; kernel variable normal/overaligned +; extern variable normal/overaligned + +define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s15, 0 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CHECK-NEXT: s_endpgm + store i16 2, ptr addrspace(3) @kernel_normal + + call void @use_extern_normal() + ret void +} + +define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) { +; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal: +; CHECK: ; %bb.0: +; 
CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 4 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, ptr addrspace(3) @module_variable + + store i16 2, ptr addrspace(3) @kernel_normal + + call void @use_extern_normal() + ret void +} + +define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s15, 2 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CHECK-NEXT: s_endpgm + store i16 2, ptr addrspace(3) @kernel_overalign + + call void @use_extern_normal() + ret void +} + +define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) { +; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 
s15, 6 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, ptr addrspace(3) @module_variable + + store i16 2, ptr addrspace(3) @kernel_overalign + + call void @use_extern_normal() + ret void +} + +define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s15, 1 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CHECK-NEXT: s_endpgm + store i16 2, ptr addrspace(3) @kernel_normal + + call void @use_extern_overalign() + ret void +} + +define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) { +; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 5 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, ptr addrspace(3) @module_variable + + store i16 2, ptr addrspace(3) @kernel_normal + + call void @use_extern_overalign() + ret void +} + +define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 +; 
CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s15, 3 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CHECK-NEXT: s_endpgm + store i16 2, ptr addrspace(3) @kernel_overalign + + call void @use_extern_overalign() + ret void +} + +define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) { +; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 7 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, ptr addrspace(3) @module_variable + + store i16 2, ptr addrspace(3) @kernel_overalign + + call void @use_extern_overalign() + ret void +} + + attributes #0 = { noinline } attributes #1 = { "amdgpu-elide-module-lds" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll @@ -18,10 +18,6 @@ @const_undef = addrspace(3) constant i32 undef @const_with_init = addrspace(3) constant i64 8 -; External and constant are both left to the optimizer / error diagnostics -; CHECK: @extern = external addrspace(3) global i32 -@extern = external addrspace(3) global i32 - ; Use of an addrspace(3) variable with an initializer is skipped, ; so as to preserve the unimplemented error from llc ; CHECK: @with_init = addrspace(3) global i64 0 @@ -47,14 +43,12 @@ ; CHECK: %c0 = load i32, ptr addrspace(3) @const_undef, align 4 ; CHECK: %c1 = load i64, ptr addrspace(3) @const_with_init, align 4 ; CHECK: %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst -; CHECK: %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic -; CHECK: %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic +; CHECK: %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic define void @use_variables() { %c0 = load i32, ptr addrspace(3) @const_undef, align 4 %c1 = load i64, ptr addrspace(3) @const_with_init, align 4 %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst - %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic - %v2 = 
atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic + %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic ret void } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + + +; Not reached by a non-kernel function and therefore not changed by this pass +@dynamic_kernel_only = external addrspace(3) global [0 x double] + +; shared8 is used directly by a kernel so remains in the output +; the other three are only used by functions and will be replaced with +; table lookups and dropped from the IR +@dynamic_shared1 = external addrspace(3) global [0 x i8], align 1 +@dynamic_shared2 = external addrspace(3) global [0 x i16], align 2 +@dynamic_shared4 = external addrspace(3) global [0 x i32], align 4 +@dynamic_shared8 = external addrspace(3) global [0 x i64], align 8 + +; CHECK: %llvm.amdgcn.module.lds.t = type { i32 } +; CHECK: @dynamic_kernel_only = external addrspace(3) global [0 x double] +; CHECK: @dynamic_shared8 = external addrspace(3) global [0 x i64], align 8 +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4, !absolute_symbol !0 +; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" + +; Alignment of these must be the maximum of the alignment of the reachable symbols +; CHECK: @llvm.amdgcn.expect_align1.dynlds = external addrspace(3) global [0 x i8], align 1, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.expect_align2.dynlds = external addrspace(3) global [0 x i8], align 2, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.expect_align4.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !1 +; CHECK: @llvm.amdgcn.expect_align8.dynlds = external addrspace(3) global [0 x i8], align 8, !absolute_symbol !0 + +; Align 4 and symbol at address [4 5) as module.lds is reachable +; CHECK: @llvm.amdgcn.expect_max_of_2_and_4.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !1 + +; Builds a lookup table out of the newly created (suffixed .dynlds) variables in kernel.id order +; CHECK: @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [5 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds to i32)] + + + +define amdgpu_kernel void @kernel_only() { +; CHECK-LABEL: @kernel_only() #0 { +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0 +; CHECK-NEXT: store double 3.140000e+00, ptr addrspace(3) [[ARRAYIDX]], align 8 +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0 + store double 3.140000e+00, ptr addrspace(3) %arrayidx + ret void +} + +; The accesses from functions are
rewritten to go through the llvm.amdgcn.dynlds.offset.table +define void @use_shared1() #0 { +; CHECK-LABEL: @use_shared1() #1 { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[DYNAMIC_SHARED1:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED1]], align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED11:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) [[DYNAMIC_SHARED11]], i32 0, i32 1 +; CHECK-NEXT: store i8 0, ptr addrspace(3) [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x i8], ptr addrspace(3) @dynamic_shared1, i32 0, i32 1 + store i8 0, ptr addrspace(3) %arrayidx + ret void +} + +define void @use_shared2() #0 { +; CHECK-LABEL: @use_shared2() #1 { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[DYNAMIC_SHARED2:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED2]], align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED21:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i16], ptr addrspace(3) [[DYNAMIC_SHARED21]], i32 0, i32 3 +; CHECK-NEXT: store i16 1, ptr addrspace(3) [[ARRAYIDX]], align 2 +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x i16], ptr addrspace(3) @dynamic_shared2, i32 0, i32 3 + store i16 1, ptr addrspace(3) %arrayidx + ret void +} + +; Include a normal variable so that the new variables aren't all at the same absolute_symbol +@static_shared = addrspace(3) global i32 undef +define void @use_shared4() #0 { +; CHECK-LABEL: @use_shared4() #1 { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: store i32 4, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED4:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED4]], align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED41:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr addrspace(3) [[DYNAMIC_SHARED41]], i32 0, i32 5 +; CHECK-NEXT: store i32 2, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: ret void +; + store i32 4, ptr addrspace(3) @static_shared + %arrayidx = getelementptr inbounds [0 x i32], ptr addrspace(3) @dynamic_shared4, i32 0, i32 5 + store i32 2, ptr addrspace(3) %arrayidx + ret void +} + +define void @use_shared8() #0 { +; CHECK-LABEL: @use_shared8() #1 { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[DYNAMIC_SHARED8:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED81:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) [[DYNAMIC_SHARED81]], i32 0, i32 7 +; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 7 + 
store i64 3, ptr addrspace(3) %arrayidx + ret void +} + +; The kernels are annotated with kernel.id and llvm.donothing use of the corresponding variable +define amdgpu_kernel void @expect_align1() { +; CHECK-LABEL: @expect_align1() #0 !llvm.amdgcn.lds.kernel.id !2 +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds) ] +; CHECK-NEXT: call void @use_shared1() +; CHECK-NEXT: ret void +; + call void @use_shared1() + ret void +} + +define amdgpu_kernel void @expect_align2() { +; CHECK-LABEL: @expect_align2() #0 !llvm.amdgcn.lds.kernel.id !3 +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds) ] +; CHECK-NEXT: call void @use_shared2() +; CHECK-NEXT: ret void +; + call void @use_shared2() + ret void +} + +define amdgpu_kernel void @expect_align4() { +; CHECK-LABEL: @expect_align4() !llvm.amdgcn.lds.kernel.id !4 { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @use_shared4() +; CHECK-NEXT: ret void +; + call void @use_shared4() + ret void +} + +; Use dynamic_shared directly too. Can elide module lds (#0) +define amdgpu_kernel void @expect_align8() { +; CHECK-LABEL: @expect_align8() #0 !llvm.amdgcn.lds.kernel.id !5 { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 +; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: call void @use_shared8() +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 + store i64 3, ptr addrspace(3) %arrayidx + call void @use_shared8() + ret void +} + +; Note: use_shared4 uses module.lds so this will allocate at offset 4 +define amdgpu_kernel void @expect_max_of_2_and_4() { +; CHECK-LABEL: @expect_max_of_2_and_4() !llvm.amdgcn.lds.kernel.id !6 { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @use_shared2() +; CHECK-NEXT: call void @use_shared4() +; CHECK-NEXT: ret void +; + call void @use_shared2() + call void @use_shared4() + ret void +} + + +attributes #0 = { noinline } + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +; CHECK: declare void @llvm.donothing() #2 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: declare i32 @llvm.amdgcn.lds.kernel.id() #3 + +; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #1 = { noinline } +; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !1 = !{i64 4, i64 5} +; CHECK: !2 = !{i32 0} +; CHECK: !3 = !{i32 1} +; CHECK: !4 = !{i32 2} +; CHECK: !5 = !{i32 3} +; CHECK: !6 = !{i32 4}