diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2542,7 +2542,7 @@
     // allocated ones. They all share the same offset.
     if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
       // Adjust alignment for that dynamic shared memory array.
-      MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+      MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
       LLT S32 = LLT::scalar(32);
       auto Sz =
           B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -184,6 +184,18 @@
   return AMDGPU::isKernel(F->getCallingConv());
 }
 
+bool isDynamicLDS(const GlobalVariable *GV) {
+  if (AMDGPU::isLDSVariableToLower(*GV)) {
+    if (!GV->hasInitializer()) {
+      // addrspace(3) without initializer implies cuda/hip extern __shared__.
+      // The semantics for such a variable appear to be that all extern
+      // __shared__ variables alias one another.
+      return true;
+    }
+  }
+  return false;
+}
+
 class AMDGPULowerModuleLDS : public ModulePass {
 
   static void
@@ -468,6 +480,44 @@
                                     AMDGPUAS::CONSTANT_ADDRESS);
   }
 
+  void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
+                                 GlobalVariable *LookupTable,
+                                 GlobalVariable *GV, Use &U,
+                                 Value *OptionalIndex) {
+    // Table is a constant array of the same length as OrderedKernels.
+    // It either contains 32 bit pointers or structs of 32 bit pointers, where
+    // the OptionalIndex argument is used to index into the struct if present.
+    LLVMContext &Ctx = M.getContext();
+    Type *I32 = Type::getInt32Ty(Ctx);
+    auto *I = cast<Instruction>(U.getUser());
+
+    Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction());
+
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      BasicBlock *BB = Phi->getIncomingBlock(U);
+      Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
+    } else {
+      Builder.SetInsertPoint(I);
+    }
+
+    SmallVector<Value *, 3> GEPIdx = {
+        ConstantInt::get(I32, 0),
+        tableKernelIndex,
+    };
+    if (OptionalIndex)
+      GEPIdx.push_back(OptionalIndex);
+
+    Value *Address = Builder.CreateInBoundsGEP(
+        LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
+
+    Value *loaded = Builder.CreateLoad(I32, Address);
+
+    Value *replacement =
+        Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
+
+    U.set(replacement);
+  }
+
   void replaceUsesInInstructionsWithTableLookup(
       Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
       GlobalVariable *LookupTable) {
@@ -484,33 +534,8 @@
         if (!I)
          continue;
 
-        Value *tableKernelIndex =
-            getTableLookupKernelIndex(M, I->getFunction());
-
-        // So if the phi uses this value multiple times, what does this look
-        // like?
-        if (auto *Phi = dyn_cast<PHINode>(I)) {
-          BasicBlock *BB = Phi->getIncomingBlock(U);
-          Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
-        } else {
-          Builder.SetInsertPoint(I);
-        }
-
-        Value *GEPIdx[3] = {
-            ConstantInt::get(I32, 0),
-            tableKernelIndex,
-            ConstantInt::get(I32, Index),
-        };
-
-        Value *Address = Builder.CreateInBoundsGEP(
-            LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
-
-        Value *loaded = Builder.CreateLoad(I32, Address);
-
-        Value *replacement =
-            Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
-
-        U.set(replacement);
+        replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
+                                  ConstantInt::get(I32, Index));
       }
     }
   }
@@ -632,47 +657,52 @@
   }
 
   static std::vector<Function *> assignLDSKernelIDToEachKernel(
-      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS) {
+      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
+      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
     // Associate kernels in the set with an arbirary but reproducible order and
     // annotate them with that order in metadata. This metadata is recognised by
     // the backend and lowered to a SGPR which can be read from using
     // amdgcn_lds_kernel_id.
 
     std::vector<Function *> OrderedKernels;
+    if (!KernelsThatAllocateTableLDS.empty() ||
+        !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
 
-    for (Function &Func : M->functions()) {
-      if (Func.isDeclaration())
-        continue;
-      if (!isKernelLDS(&Func))
-        continue;
+      for (Function &Func : M->functions()) {
+        if (Func.isDeclaration())
+          continue;
+        if (!isKernelLDS(&Func))
+          continue;
 
-      if (KernelsThatAllocateTableLDS.contains(&Func)) {
-        assert(Func.hasName()); // else fatal error earlier
-        OrderedKernels.push_back(&Func);
+        if (KernelsThatAllocateTableLDS.contains(&Func) ||
+            KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) {
+          assert(Func.hasName()); // else fatal error earlier
+          OrderedKernels.push_back(&Func);
+        }
       }
-    }
 
-    // Put them in an arbitrary but reproducible order
-    llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
-               [](const Function *lhs, const Function *rhs) -> bool {
-                 return lhs->getName() < rhs->getName();
-               });
+      // Put them in an arbitrary but reproducible order
+      llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
+                 [](const Function *lhs, const Function *rhs) -> bool {
+                   return lhs->getName() < rhs->getName();
+                 });
 
-    // Annotate the kernels with their order in this vector
-    LLVMContext &Ctx = M->getContext();
-    IRBuilder<> Builder(Ctx);
+      // Annotate the kernels with their order in this vector
+      LLVMContext &Ctx = M->getContext();
+      IRBuilder<> Builder(Ctx);
 
-    if (OrderedKernels.size() > UINT32_MAX) {
-      // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
-      report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
-    }
+      if (OrderedKernels.size() > UINT32_MAX) {
+        // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
+        report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
+      }
 
-    for (size_t i = 0; i < OrderedKernels.size(); i++) {
-      Metadata *AttrMDArgs[1] = {
-          ConstantAsMetadata::get(Builder.getInt32(i)),
-      };
-      OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
-                                     MDNode::get(Ctx, AttrMDArgs));
+      for (size_t i = 0; i < OrderedKernels.size(); i++) {
+        Metadata *AttrMDArgs[1] = {
+            ConstantAsMetadata::get(Builder.getInt32(i)),
+        };
+        OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
+                                       MDNode::get(Ctx, AttrMDArgs));
+      }
     }
 
     return OrderedKernels;
@@ -683,7 +713,8 @@
       VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
       DenseSet<GlobalVariable *> &ModuleScopeVariables,
       DenseSet<GlobalVariable *> &TableLookupVariables,
-      DenseSet<GlobalVariable *> &KernelAccessVariables) {
+      DenseSet<GlobalVariable *> &KernelAccessVariables,
+      DenseSet<GlobalVariable *> &DynamicVariables) {
 
     GlobalVariable *HybridModuleRoot =
         LoweringKindLoc != LoweringKind::hybrid
@@ -705,6 +736,11 @@
       assert(AMDGPU::isLDSVariableToLower(*GV));
       assert(K.second.size() != 0);
 
+      if (isDynamicLDS(GV)) {
+        DynamicVariables.insert(GV);
+        continue;
+      }
+
       switch (LoweringKindLoc) {
       case LoweringKind::module:
         ModuleScopeVariables.insert(GV);
@@ -743,7 +779,7 @@
     // All LDS variables accessed indirectly have now been partitioned into
     // the distinct lowering strategies.
     assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
-               KernelAccessVariables.size() ==
+               KernelAccessVariables.size() + DynamicVariables.size() ==
            LDSToKernelsThatNeedToAccessItIndirectly.size());
   }
 
@@ -813,7 +849,7 @@
         markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
 
       } else {
-        Func.addFnAttr("amdgpu-elide-module-lds");
+        markElideModuleLDS(Func);
       }
     }
 
@@ -838,12 +874,15 @@
       // Allocating variables that are used directly in this struct to get
       // alignment aware allocation and predictable frame size.
       for (auto &v : LDSUsesInfo.direct_access[&Func]) {
-        KernelUsedVariables.insert(v);
+        if (!isDynamicLDS(v)) {
+          KernelUsedVariables.insert(v);
+        }
       }
 
       // Allocating variables that are accessed indirectly so that a lookup of
       // this struct instance can find them from nested functions.
       for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
+        assert(!isDynamicLDS(v));
        KernelUsedVariables.insert(v);
       }
 
@@ -857,6 +896,7 @@
 
       if (KernelUsedVariables.empty()) {
         // Either used no LDS, or the LDS it used was all in the module struct
+        // or dynamically sized
         continue;
       }
 
@@ -876,33 +916,6 @@
       auto Replacement =
           createLDSVariableReplacement(M, VarName, KernelUsedVariables);
 
-      // This struct is allocated at a predictable address that can be
-      // calculated now, recorded in metadata then used to lower references to
-      // it during codegen.
-      {
-        // frame layout, starting from 0
-        //{
-        //   module.lds
-        //   alignment padding
-        //   kernel instance
-        //}
-
-        if (!MaybeModuleScopeStruct ||
-            Func.hasFnAttribute("amdgpu-elide-module-lds")) {
-          // There's no module.lds for this kernel so this replacement struct
-          // goes first
-          recordLDSAbsoluteAddress(&M, Replacement.SGV, 0);
-        } else {
-          const DataLayout &DL = M.getDataLayout();
-          TypeSize ModuleSize =
-              DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
-          GlobalVariable *KernelStruct = Replacement.SGV;
-          Align KernelAlign = AMDGPU::getAlign(DL, KernelStruct);
-          recordLDSAbsoluteAddress(&M, Replacement.SGV,
-                                   alignTo(ModuleSize, KernelAlign));
-        }
-      }
-
       // remove preserves existing codegen
       removeLocalVarsFromUsedLists(M, KernelUsedVariables);
       KernelToReplacement[&Func] = Replacement;
@@ -917,6 +930,124 @@
     return KernelToReplacement;
   }
 
+  static GlobalVariable *
+  buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
+                                        Function *func) {
+    // Create a dynamic lds variable with a name associated with the passed
+    // function that has the maximum alignment of any dynamic lds variable
+    // reachable from this kernel. Dynamic LDS is allocated after the static LDS
+    // allocation, possibly after alignment padding. The representative variable
+    // created here has the maximum alignment of any other dynamic variable
+    // reachable by that kernel. All dynamic LDS variables are allocated at the
+    // same address in each kernel in order to provide the documented aliasing
+    // semantics. Setting the alignment here allows this IR pass to accurately
+    // predict the exact constant at which it will be allocated.
+
+    assert(isKernelLDS(func));
+
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
+    Align MaxDynamicAlignment(1);
+
+    auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
+      if (isDynamicLDS(GV)) {
+        MaxDynamicAlignment =
+            std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV));
+      }
+    };
+
+    for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
+      UpdateMaxAlignment(GV);
+    }
+
+    for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) {
+      UpdateMaxAlignment(GV);
+    }
+
+    assert(func->hasName()); // Checked by caller
+    std::string VarName =
+        Twine("llvm.amdgcn." + func->getName() + ".dynlds").str();
+
+    auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+    GlobalVariable *N = new GlobalVariable(
+        M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
+        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+        false);
+    N->setAlignment(MaxDynamicAlignment);
+
+    assert(isDynamicLDS(N));
+    return N;
+  }
+
+  DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
+      Module &M, LDSUsesInfoTy &LDSUsesInfo,
+      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
+      DenseSet<GlobalVariable *> const &DynamicVariables,
+      std::vector<Function *> const &OrderedKernels) {
+    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
+    if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
+      LLVMContext &Ctx = M.getContext();
+      IRBuilder<> Builder(Ctx);
+      Type *I32 = Type::getInt32Ty(Ctx);
+
+      std::vector<Constant *> newDynamicLDS;
+
+      // Table is built in the same order as OrderedKernels
+      for (auto &func : OrderedKernels) {
+
+        if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
+          assert(isKernelLDS(func));
+          if (!func->hasName()) {
+            report_fatal_error("Anonymous kernels cannot use LDS variables");
+          }
+
+          GlobalVariable *N =
+              buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);
+
+          KernelToCreatedDynamicLDS[func] = N;
+
+          markUsedByKernel(Builder, func, N);
+
+          auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+          auto GEP = ConstantExpr::getGetElementPtr(
+              emptyCharArray, N, ConstantInt::get(I32, 0), true);
+          newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32));
+        } else {
+          newDynamicLDS.push_back(PoisonValue::get(I32));
+        }
+      }
+      assert(OrderedKernels.size() == newDynamicLDS.size());
+
+      ArrayType *t = ArrayType::get(I32, newDynamicLDS.size());
+      Constant *init = ConstantArray::get(t, newDynamicLDS);
+      GlobalVariable *table = new GlobalVariable(
+          M, t, true, GlobalValue::InternalLinkage, init,
+          "llvm.amdgcn.dynlds.offset.table", nullptr,
+          GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
+
+      for (GlobalVariable *GV : DynamicVariables) {
+        for (Use &U : make_early_inc_range(GV->uses())) {
+          auto *I = dyn_cast<Instruction>(U.getUser());
+          if (!I)
+            continue;
+          if (isKernelLDS(I->getFunction()))
+            continue;
+
+          replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
+        }
+      }
+    }
+    return KernelToCreatedDynamicLDS;
+  }
+
+  static bool canElideModuleLDS(const Function &F) {
+    return F.hasFnAttribute("amdgpu-elide-module-lds");
+  }
+
+  static void markElideModuleLDS(Function &F) {
+    F.addFnAttr("amdgpu-elide-module-lds");
+  }
+
   bool runOnModule(Module &M) override {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
@@ -939,12 +1070,15 @@
       }
     }
 
+    // Partition variables accessed indirectly into the different strategies
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
     DenseSet<GlobalVariable *> KernelAccessVariables;
+    DenseSet<GlobalVariable *> DynamicVariables;
     partitionVariablesIntoIndirectStrategies(
         M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
-        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables);
+        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
+        DynamicVariables);
 
     // If the kernel accesses a variable that is going to be stored in the
     // module instance through a call then that kernel needs to allocate the
@@ -956,6 +1090,10 @@
         kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                         TableLookupVariables);
 
+    DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        DynamicVariables);
+
     GlobalVariable *MaybeModuleScopeStruct =
        lowerModuleScopeStructVariables(
            M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
 
@@ -981,14 +1119,15 @@
       });
     }
 
+    // The ith element of this vector is kernel id i
+    std::vector<Function *> OrderedKernels =
+        assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS,
+                                      KernelsThatIndirectlyAllocateDynamicLDS);
+
     if (!KernelsThatAllocateTableLDS.empty()) {
       LLVMContext &Ctx = M.getContext();
       IRBuilder<> Builder(Ctx);
 
-      // The ith element of this vector is kernel id i
-      std::vector<Function *> OrderedKernels =
-          assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS);
-
       for (size_t i = 0; i < OrderedKernels.size(); i++) {
         markUsedByKernel(Builder, OrderedKernels[i],
                          KernelToReplacement[OrderedKernels[i]].SGV);
@@ -1010,6 +1149,67 @@
                                              LookupTable);
     }
 
+    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
+        lowerDynamicLDSVariables(M, LDSUsesInfo,
+                                 KernelsThatIndirectlyAllocateDynamicLDS,
+                                 DynamicVariables, OrderedKernels);
+
+    // All kernel frames have been allocated. Calculate and record the
+    // addresses.
+
+    {
+      const DataLayout &DL = M.getDataLayout();
+
+      for (Function &Func : M.functions()) {
+        if (Func.isDeclaration() || !isKernelLDS(&Func))
+          continue;
+
+        // All three of these are optional. The first variable is allocated at
+        // zero. They are allocated by allocateKnownAddressLDSGlobal in the
+        // following order:
+        //{
+        //  module.lds
+        //  alignment padding
+        //  kernel instance
+        //  alignment padding
+        //  dynamic lds variables
+        //}
+
+        const bool AllocateModuleScopeStruct =
+            MaybeModuleScopeStruct && !canElideModuleLDS(Func);
+
+        const bool AllocateKernelScopeStruct =
+            KernelToReplacement.contains(&Func);
+
+        const bool AllocateDynamicVariable =
+            KernelToCreatedDynamicLDS.contains(&Func);
+
+        uint32_t Offset = 0;
+
+        if (AllocateModuleScopeStruct) {
+          // Allocated at zero, recorded once on construction, not once per
+          // kernel
+          Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
+        }
+
+        if (AllocateKernelScopeStruct) {
+          GlobalVariable *KernelStruct = KernelToReplacement[&Func].SGV;
+
+          Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
+
+          recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
+        }
+
+        if (AllocateDynamicVariable) {
+          GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
+
+          Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
+
+          recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
+        }
+      }
+    }
+
     for (auto &GV : make_early_inc_range(M.globals()))
       if (AMDGPU::isLDSVariableToLower(GV)) {
         // probably want to remove from used lists
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -111,7 +111,7 @@
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
-  void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
+  void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
 };
 
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -99,6 +99,15 @@
   return M->getNamedGlobal(KernelLDSName);
 }
 
+static const GlobalVariable *
+getKernelDynLDSGlobalFromFunction(const Function &F) {
+  const Module *M = F.getParent();
+  std::string KernelDynLDSName = "llvm.amdgcn.";
+  KernelDynLDSName += F.getName();
+  KernelDynLDSName += ".dynlds";
+  return M->getNamedGlobal(KernelDynLDSName);
+}
+
 // This kernel calls no functions that require the module lds struct
 static bool canElideModuleLDS(const Function &F) {
   return F.hasFnAttribute("amdgpu-elide-module-lds");
 }
@@ -131,6 +140,7 @@
 
   const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName);
   const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+  const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
 
   if (GV && !canElideModuleLDS(F)) {
     unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
@@ -149,6 +159,19 @@
         report_fatal_error("Inconsistent metadata on kernel LDS variable");
       }
     }
+
+    if (Dyn) {
+      // The dynamic LDS is deterministic because the per-kernel one has the
+      // maximum alignment of any reachable dynamic LDS variable, and all
+      // remaining LDS variables, if this is present, are themselves dynamic
+      // LDS and will be allocated at the same address.
+      setDynLDSAlign(F, *Dyn);
+      unsigned Offset = LDSSize;
+      std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+      if (!Expect || (Offset != Expect)) {
+        report_fatal_error("Inconsistent metadata on dynamic LDS variable");
+      }
+    }
   }
 }
 
@@ -187,8 +210,10 @@
   return {};
 }
 
-void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
+void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
                                            const GlobalVariable &GV) {
+  const Module *M = F.getParent();
+  const DataLayout &DL = M->getDataLayout();
   assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
 
   Align Alignment =
@@ -198,4 +223,17 @@
 
   LDSSize = alignTo(StaticLDSSize, Alignment);
   DynLDSAlign = Alignment;
+
+  // If there is a dynamic LDS variable associated with this function F, every
+  // further dynamic LDS instance (allocated by calling setDynLDSAlign) must
+  // map to the same address. This holds because no LDS is allocated after the
+  // lowering pass if there are dynamic LDS variables present.
+  const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
+  if (Dyn) {
+    unsigned Offset = LDSSize; // return this?
+    std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+    if (!Expect || (Offset != Expect)) {
+      report_fatal_error("Inconsistent metadata on dynamic LDS variable");
+    }
+  }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6139,7 +6139,8 @@
     if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
       assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
       // Adjust alignment for that dynamic shared memory array.
-      MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+      Function &F = DAG.getMachineFunction().getFunction();
+      MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
       return SDValue(
           DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
     }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -79,24 +79,23 @@
   if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
     return false;
   }
+  if (GV.isConstant()) {
+    // A constant undef variable can't be written to, and any load is
+    // undef, so it should be eliminated by the optimizer. It could be
+    // dropped by the back end if not. This pass skips over it.
+    return false;
+  }
   if (!GV.hasInitializer()) {
     // addrspace(3) without initializer implies cuda/hip extern __shared__
     // the semantics for such a variable appears to be that all extern
-    // __shared__ variables alias one another, in which case this transform
-    // is not required
-    return false;
+    // __shared__ variables alias one another. These are lowered as dynamic LDS.
+    return true;
   }
   if (!isa<UndefValue>(GV.getInitializer())) {
     // Initializers are unimplemented for LDS address space.
     // Leave such variables in place for consistent error reporting.
     return false;
   }
-  if (GV.isConstant()) {
-    // A constant undef variable can't be written to, and any load is
-    // undef, so it should be eliminated by the optimizer. It could be
-    // dropped by the back end if not. This pass skips over it.
-    return false;
-  }
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -18,10 +18,6 @@
 @const_undef = addrspace(3) constant i32 undef
 @const_with_init = addrspace(3) constant i64 8
 
-; External and constant are both left to the optimizer / error diagnostics
-; CHECK: @extern = external addrspace(3) global i32
-@extern = external addrspace(3) global i32
-
 ; Use of an addrspace(3) variable with an initializer is skipped,
 ; so as to preserve the unimplemented error from llc
 ; CHECK: @with_init = addrspace(3) global i64 0
@@ -47,14 +43,12 @@
 ; CHECK: %c0 = load i32, ptr addrspace(3) @const_undef, align 4
 ; CHECK: %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
 ; CHECK: %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
-; CHECK: %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
-; CHECK: %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
+; CHECK: %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
 define void @use_variables() {
   %c0 = load i32, ptr addrspace(3) @const_undef, align 4
   %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
   %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
-  %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
-  %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
+  %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
   ret void
 }
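
Illustrative sketch, not part of the patch: roughly the IR shape this lowering is expected to produce when a non-kernel function uses a dynamic (extern __shared__ style) LDS variable. The kernel name k0, the function @f and the metadata ids here are hypothetical; only the llvm.amdgcn.<kernel>.dynlds / llvm.amdgcn.dynlds.offset.table naming, the !absolute_symbol annotation and the table-lookup pattern come from the code above.

; One external, zero-sized representative per kernel, pinned to a known LDS
; address via !absolute_symbol, plus a constant table of per-kernel offsets.
@llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !0
@llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.k0.dynlds to i32)]

define void @f() {
  ; Non-kernel access: index the table by the calling kernel's id, load the
  ; 32-bit offset, then inttoptr it back to an LDS pointer.
  %id = call i32 @llvm.amdgcn.lds.kernel.id()
  %gep = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 %id
  %offset = load i32, ptr addrspace(4) %gep
  %dyn = inttoptr i32 %offset to ptr addrspace(3)
  store i8 0, ptr addrspace(3) %dyn
  ret void
}

declare i32 @llvm.amdgcn.lds.kernel.id()

!0 = !{i32 0, i32 1}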