diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -48,10 +48,99 @@
 namespace {
 
-class AMDGPULowerModuleLDS : public ModulePass {
+class LowerModuleLDSImpl {
+  Module &M;
+  LLVMContext &Ctx;
+  const DataLayout &DL;
+
+  // Sort by alignment, descending, to minimise padding. On ties, sort by size,
+  // descending, then by name, lexicographical.
+  void sortLocalVars(std::vector<GlobalVariable *> &FoundLocalVars) {
+    llvm::stable_sort(
+        FoundLocalVars,
+        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+          Align ALHS = AMDGPU::getAlign(DL, LHS);
+          Align ARHS = AMDGPU::getAlign(DL, RHS);
+          if (ALHS != ARHS) {
+            return ALHS > ARHS;
+          }
+
+          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+          if (SLHS != SRHS) {
+            return SLHS > SRHS;
+          }
+
+          // By variable name on tie for predictable order in test cases.
+          return LHS->getName() < RHS->getName();
+        });
+  }
+
+  std::vector<GlobalVariable *> insertPaddingVarsWithinSortedLocalVarsList(
+      std::vector<GlobalVariable *> &FoundLocalVars) {
+    std::vector<GlobalVariable *> LocalVars;
+    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+
+    // This usually won't need to insert any padding, perhaps avoid the alloc
+    uint64_t CurrentOffset = 0;
+    for (size_t I = 0; I < FoundLocalVars.size(); I++) {
+      GlobalVariable *FGV = FoundLocalVars[I];
+      Align DataAlign = AMDGPU::getAlign(DL, FGV);
+
+      uint64_t DataAlignV = DataAlign.value();
+      if (uint64_t Rem = CurrentOffset % DataAlignV) {
+        uint64_t Padding = DataAlignV - Rem;
+
+        // Append an array of padding bytes to meet alignment requested
+        // Note (o + (a - (o % a)) ) % a == 0
+        //      (offset + Padding ) % align == 0
+
+        Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+        LocalVars.push_back(new GlobalVariable(
+            M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+            "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+            false));
+        CurrentOffset += Padding;
+      }
+
+      LocalVars.push_back(FGV);
+      CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+    }
+
+    return LocalVars;
+  }
+
+  std::pair<StructType *, GlobalVariable *>
+  createNewStructTypeAndItsInstance(std::vector<GlobalVariable *> &LocalVars) {
+    std::vector<Type *> LocalVarTypes;
+    LocalVarTypes.reserve(LocalVars.size());
+    std::transform(
+        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+    StructType *LDSTy = StructType::create(
+        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
+
+    Align MaxAlign =
+        AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
+
+    GlobalVariable *SGV = new GlobalVariable(
+        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
+        AMDGPUAS::LOCAL_ADDRESS, false);
-  static void removeFromUsedList(Module &M, StringRef Name,
-                                 SmallPtrSetImpl<Constant *> &ToRemove) {
+    SGV->setAlignment(MaxAlign);
+
+    appendToCompilerUsed(
+        M, {static_cast<GlobalValue *>(
+               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    return std::make_pair(LDSTy, SGV);
+  }
+
+  void removeFromUsedList(Module &M, StringRef Name,
+                          SmallPtrSetImpl<Constant *> &ToRemove) {
     GlobalVariable *GV = M.getGlobalVariable(Name);
     if (!GV || ToRemove.empty()) {
       return;
@@ -83,9 +172,8 @@
     }
   }
 
-  static void
-  removeFromUsedLists(Module &M,
-                      const std::vector<GlobalVariable *> &LocalVars) {
+  void removeFromUsedLists(Module &M,
+                           const std::vector<GlobalVariable *> &LocalVars) {
     SmallPtrSet<Constant *, 32> LocalVarsSet;
     for (size_t I = 0; I < LocalVars.size(); I++) {
       if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
@@ -96,8 +184,26 @@
     removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
   }
 
-  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
-                               GlobalVariable *SGV) {
+  void replaceUsesOfLocalVars(std::vector<GlobalVariable *> &LocalVars,
+                              StructType *LDSTy, GlobalVariable *SGV) {
+    // Replace uses of ith variable with a constantexpr to the ith field of the
+    // instance that will be allocated by AMDGPUMachineFunction
+    Constant *InstanceAddress = Constant::getIntegerValue(
+        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+
+    Type *I32 = Type::getInt32Ty(Ctx);
+
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      GV->replaceAllUsesWith(
+          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+      GV->eraseFromParent();
+    }
+  }
+
+  void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
+                        GlobalVariable *SGV) {
     // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
     // that might call a function which accesses a field within it. This is
     // presently approximated to 'all kernels' if there are any such functions
@@ -129,101 +235,44 @@
                        "");
   }
 
-public:
-  static char ID;
-
-  AMDGPULowerModuleLDS() : ModulePass(ID) {
-    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+  void markUsedByKernel(GlobalVariable *SGV) {
+    IRBuilder<> Builder(Ctx);
+    SmallPtrSet<Function *, 32> Kernels;
+    for (auto &I : M.functions()) {
+      Function *Func = &I;
+      if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
+        markUsedByKernel(Builder, Func, SGV);
+        Kernels.insert(Func);
+      }
+    }
   }
 
-  bool runOnModule(Module &M) override {
-    LLVMContext &Ctx = M.getContext();
-    const DataLayout &DL = M.getDataLayout();
-    SmallPtrSet<GlobalValue *, 32> UsedList = AMDGPU::getUsedList(M);
+public:
+  explicit LowerModuleLDSImpl(Module &M)
+      : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) {}
 
-    // Find variables to move into new struct instance
+  bool lower() {
+    // Find variables to move into new struct instance.
     std::vector<GlobalVariable *> FoundLocalVars =
-        AMDGPU::findVariablesToLower(M, UsedList);
+        AMDGPU::findVariablesToLower(M);
 
     if (FoundLocalVars.empty()) {
       // No variables to rewrite, no changes made.
       return false;
     }
 
-    // Sort by alignment, descending, to minimise padding.
-    // On ties, sort by size, descending, then by name, lexicographical.
-    llvm::stable_sort(
-        FoundLocalVars,
-        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
-          Align ALHS = AMDGPU::getAlign(DL, LHS);
-          Align ARHS = AMDGPU::getAlign(DL, RHS);
-          if (ALHS != ARHS) {
-            return ALHS > ARHS;
-          }
-
-          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
-          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
-          if (SLHS != SRHS) {
-            return SLHS > SRHS;
-          }
-
-          // By variable name on tie for predictable order in test cases.
-          return LHS->getName() < RHS->getName();
-        });
-
-    std::vector<GlobalVariable *> LocalVars;
-    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
-    {
-      // This usually won't need to insert any padding, perhaps avoid the alloc
-      uint64_t CurrentOffset = 0;
-      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
-        GlobalVariable *FGV = FoundLocalVars[I];
-        Align DataAlign = AMDGPU::getAlign(DL, FGV);
-
-        uint64_t DataAlignV = DataAlign.value();
-        if (uint64_t Rem = CurrentOffset % DataAlignV) {
-          uint64_t Padding = DataAlignV - Rem;
-
-          // Append an array of padding bytes to meet alignment requested
-          // Note (o + (a - (o % a)) ) % a == 0
-          //      (offset + Padding ) % align == 0
-
-          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
-          LocalVars.push_back(new GlobalVariable(
-              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
-              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
-              false));
-          CurrentOffset += Padding;
-        }
-
-        LocalVars.push_back(FGV);
-        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
-      }
-    }
-
-    std::vector<Type *> LocalVarTypes;
-    LocalVarTypes.reserve(LocalVars.size());
-    std::transform(
-        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
-        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
-
-    StructType *LDSTy = StructType::create(
-        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
+    // Sort by alignment, descending, to minimise padding. On ties, sort by
+    // size, descending, then by name, lexicographical.
+    sortLocalVars(FoundLocalVars);
 
-    Align MaxAlign =
-        AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
-    Constant *InstanceAddress = Constant::getIntegerValue(
-        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+    // Insert needed padding variables.
+    std::vector<GlobalVariable *> LocalVars =
+        insertPaddingVarsWithinSortedLocalVarsList(FoundLocalVars);
 
-    GlobalVariable *SGV = new GlobalVariable(
-        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
-        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
-        AMDGPUAS::LOCAL_ADDRESS, false);
-    SGV->setAlignment(MaxAlign);
-    appendToCompilerUsed(
-        M, {static_cast<GlobalValue *>(
-               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+    // Construct new struct type and its global instance.
+    StructType *LDSTy;
+    GlobalVariable *SGV;
+    std::tie(LDSTy, SGV) = createNewStructTypeAndItsInstance(LocalVars);
 
     // The verifier rejects used lists containing an inttoptr of a constant
     // so remove the variables from these lists before replaceAllUsesWith
@@ -231,35 +280,31 @@
 
     // Replace uses of ith variable with a constantexpr to the ith field of the
     // instance that will be allocated by AMDGPUMachineFunction
-    Type *I32 = Type::getInt32Ty(Ctx);
-    for (size_t I = 0; I < LocalVars.size(); I++) {
-      GlobalVariable *GV = LocalVars[I];
-      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
-      GV->replaceAllUsesWith(
-          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
-      GV->eraseFromParent();
-    }
+    replaceUsesOfLocalVars(LocalVars, LDSTy, SGV);
 
     // Mark kernels with asm that reads the address of the allocated structure
     // This is not necessary for lowering. This lets other passes, specifically
     // PromoteAlloca, accurately calculate how much LDS will be used by the
    // kernel after lowering.
-    {
-      IRBuilder<> Builder(Ctx);
-      SmallPtrSet<Function *, 32> Kernels;
-      for (auto &I : M.functions()) {
-        Function *Func = &I;
-        if (AMDGPU::isKernelCC(Func) && !Kernels.contains(Func)) {
-          markUsedByKernel(Builder, Func, SGV);
-          Kernels.insert(Func);
-        }
-      }
-    }
+    markUsedByKernel(SGV);
+
     return true;
   }
 };
 
+class AMDGPULowerModuleLDS : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPULowerModuleLDS() : ModulePass(ID) {
+    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
 } // namespace
+
 char AMDGPULowerModuleLDS::ID = 0;
 
 char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
 
@@ -268,12 +313,17 @@
                 "Lower uses of LDS variables from non-kernel functions", false,
                 false)
 
+bool AMDGPULowerModuleLDS::runOnModule(Module &M) {
+  LowerModuleLDSImpl Lowerer(M);
+  return Lowerer.lower();
+}
+
 ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
   return new AMDGPULowerModuleLDS();
 }
 
 PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                 ModuleAnalysisManager &) {
-  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
-                                               : PreservedAnalyses::all();
+  LowerModuleLDSImpl Lowerer(M);
+  return Lowerer.lower() ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -26,8 +26,7 @@
 bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                           User *InitialUser);
 
-std::vector<GlobalVariable *>
-findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList);
+std::vector<GlobalVariable *> findVariablesToLower(Module &M);
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -72,9 +72,7 @@
   return false;
 }
 
-std::vector<GlobalVariable *>
-findVariablesToLower(Module &M,
-                     const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+std::vector<GlobalVariable *> findVariablesToLower(Module &M) {
   std::vector<GlobalVariable *> LocalVars;
   for (auto &GV : M.globals()) {
     if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
@@ -99,7 +97,7 @@
       continue;
    }
    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
-          return userRequiresLowering(UsedList, U);
+          return userRequiresLowering(AMDGPU::getUsedList(M), U);
        })) {
      continue;
    }