diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -267,6 +267,10 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUDeviceScopeSharedVariablePass();
+void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &);
+extern char &AMDGPUDeviceScopeSharedVariableID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -123,13 +123,15 @@
   // should only appear when IPO passes manage to move LDS defined in a kernel
   // into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
@@ -0,0 +1,320 @@
+//===-- AMDGPUDeviceScopeSharedVariable.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO:
+//
+//===----------------------------------------------------------------------===//
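+
+// Illustrative background (editor's sketch, not part of the original patch):
+// in HIP, a "device scope shared variable" is a __shared__ variable defined
+// inside a device function rather than inside the kernel itself, e.g.:
+//
+//   __device__ void foo() {
+//     __shared__ int SharedMem[64]; // lowered to an LDS global owned by foo
+//   }
+//
+//   __global__ void kern() { foo(); } // kernel reaches the LDS indirectly
+//
+// Such globals are currently supported by force-inlining their enclosing
+// functions (see AMDGPUAlwaysInlinePass.cpp). Under
+// -amdgpu-enable-device-scope-shared-variable that inlining is skipped, and
+// this pass is expected to handle the globals instead.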
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "amdgpu-device-scope-shared-variable"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceScopeSharedVariable : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) {
+    initializeAMDGPUDeviceScopeSharedVariablePass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUDeviceScopeSharedVariable::ID = 0;
+
+char &llvm::AMDGPUDeviceScopeSharedVariableID =
+    AMDGPUDeviceScopeSharedVariable::ID;
+
+ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() {
+  return new AMDGPUDeviceScopeSharedVariable();
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUDeviceScopeSharedVariable,
+                      "amdgpu-device-scope-shared-variable",
+                      "Handle AMDGPU Device Scope Shared Variables",
+                      false /*only looks at the cfg*/, false /*analysis pass*/)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUAlwaysInline)
+INITIALIZE_PASS_DEPENDENCY(SimpleInliner)
+INITIALIZE_PASS_END(AMDGPUDeviceScopeSharedVariable,
+                    "amdgpu-device-scope-shared-variable",
+                    "Handle AMDGPU Device Scope Shared Variables",
+                    false /*only looks at the cfg*/, false /*analysis pass*/)
+
+// Build the reverse map from each enclosing function to the set of LDS
+// globals defined within it.
+static void createFunctionToLDSMap(
+    ValueMap<const GlobalVariable *, const Function *> &LDSToFunction,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &FunctionToLDS) {
+  for (const auto &Entry : LDSToFunction) {
+    const GlobalVariable *LDSGlobal = Entry.first;
+    const Function *EnclosingFunction = Entry.second;
+    // operator[] default-constructs an empty set on first access.
+    FunctionToLDS[EnclosingFunction].insert(LDSGlobal);
+  }
+}
+
+static void pairUpKernelWithLDSList(
+    const Function *K,
+    ValueMap<const Function *, SetVector<const Function *>> &KernelToCallee,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &FunctionToLDS,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &KernelToExtendedLDS) {
+  // Set which holds all the LDS globals that are defined either directly
+  // within the kernel or indirectly within its callee(s).
+  SetVector<const GlobalVariable *> ExtendedLDSSet;
+
+  // Collect all the LDS globals defined directly within the kernel.
+  if (FunctionToLDS.find(K) != FunctionToLDS.end())
+    ExtendedLDSSet = FunctionToLDS[K];
+
+  // Collect all the LDS globals defined within the callee(s) of the kernel.
+  for (const Function *Callee : KernelToCallee[K]) {
+    auto It = FunctionToLDS.find(Callee);
+    if (It == FunctionToLDS.end())
+      continue;
+    for (const GlobalVariable *CalleeLDS : It->second)
+      ExtendedLDSSet.insert(CalleeLDS);
+  }
+
+  KernelToExtendedLDS[K] = ExtendedLDSSet;
+}
+
+static void pairUpKernelWithCalleeList(
+    Module &M, const Function *K,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &FunctionToLDS,
+    ValueMap<const Function *, SetVector<const Function *>> &KernelToCallee) {
+  // Get the call graph node associated with the current kernel, traverse the
+  // call graph rooted at it in DFS manner, and collect all the callees which
+  // define LDS global(s).
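+  //
+  // Illustrative example (editor's note, not from the original patch): if
+  // kernel K calls device function F, F calls G, and only G defines an LDS
+  // global, then the DFS below records {G} as K's callee set of interest.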
"Call graph node associated with kernel definition " + "cannot be null\n"); +#endif + + for (auto it = KernCGNode->begin(); it != KernCGNode->end(); ++it) { + const CallGraphNode *CGN = it->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + } + + SetVector CallieSet; + while (!CGNodeStack.empty()) { + const CallGraphNode *CGNode = CGNodeStack.pop_back_val(); + if (!Visited.insert(CGNode)) + continue; + + Function *F = CGNode->getFunction(); + if (!F || F->isDeclaration()) { +#ifndef NDEBUG + assert(CGNode->empty() && "Call graph node associated with function " + "declaration should not have callie list\n"); +#endif + continue; + } + + auto fit = FunctionToLDS.find(F); + if (fit != FunctionToLDS.end()) + CallieSet.insert(F); + + for (auto it = CGNode->begin(); it != CGNode->end(); ++it) { + const CallGraphNode *CGN = it->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + } + } + + KernelToCallie[K] = CallieSet; +} + +static void pairUpLDSGlobalWithEnclosingFunction( + const GlobalVariable *LDSGlobal, + ValueMap &LDSToFunction) { + // Recursively visit the user list of current LDS global, and find the + // enclosing function where the LDS global is defined, and the enclosing + // function should always be successfully found. + // + // TODO: Is there any other efficient way to find the enclosing functions of + // LDS globals? +#ifndef NDEBUG + assert(!LDSGlobal->user_empty() && + "LDS Global user list cannot be empty since it must have been defined " + "within either kernel or device function"); +#endif + SmallVector UserStack; + SetVector Visited; + + for (const User *U : LDSGlobal->users()) + UserStack.push_back(U); + + while (!UserStack.empty()) { + const User *U = UserStack.pop_back_val(); + if (!Visited.insert(U)) + continue; + + if (const Instruction *I = dyn_cast(U)) { + const Function *F = I->getParent()->getParent(); + if (F) { + LDSToFunction[LDSGlobal] = F; + return; + } + continue; + } + + for (const User *UU : U->users()) + UserStack.push_back(UU); + } +#ifndef NDEBUG + assert(false && "Control is not expected to reach this point"); +#endif +} + +static void +getLDSGlobalSizeInBytes(Module &M, const GlobalVariable *LDSGlobal, + ValueMap &LDSToSize) { + Type *Ty = LDSGlobal->getValueType(); + const DataLayout &DL = M.getDataLayout(); + uint64_t SizeInBytes = DL.getTypeSizeInBits(Ty).getFixedSize() / 8; + LDSToSize[LDSGlobal] = SizeInBytes; +} + +static bool handleDeviceScopeSharedVariables( + Module &M, + ValueMap &LDSToFunction, + ValueMap> + &FunctionToLDS, + ValueMap> &KernelToCallie, + ValueMap> + &KernelToExtendedLDS, + ValueMap &LDSToSize) { + return false; +} + +static bool handleDeviceScopeSharedVariables( + Module &M, const SetVector &LDSGlobals, + const SetVector &Kernels) { + // Pair up each LDS global with the enclosing function where the LDS global is + // defined + ValueMap LDSToFunction; + for (const GlobalVariable *LDSGlobal : LDSGlobals) + pairUpLDSGlobalWithEnclosingFunction(LDSGlobal, LDSToFunction); + + // Create reverse map from enclosing function to LDS global list + ValueMap> FunctionToLDS; + createFunctionToLDSMap(LDSToFunction, FunctionToLDS); + + // Pair up kernels with callie list which define LDS globals + ValueMap> KernelToCallie; + for (const Function *K : Kernels) + pairUpKernelWithCallieList(M, K, FunctionToLDS, KernelToCallie); + + // 
+  Type *Ty = LDSGlobal->getValueType();
+  const DataLayout &DL = M.getDataLayout();
+  uint64_t SizeInBytes = DL.getTypeSizeInBits(Ty).getFixedSize() / 8;
+  LDSToSize[LDSGlobal] = SizeInBytes;
+}
+
+static bool handleDeviceScopeSharedVariables(
+    Module &M,
+    ValueMap<const GlobalVariable *, const Function *> &LDSToFunction,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &FunctionToLDS,
+    ValueMap<const Function *, SetVector<const Function *>> &KernelToCallee,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &KernelToExtendedLDS,
+    ValueMap<const GlobalVariable *, uint64_t> &LDSToSize) {
+  // TODO: The actual transformation is not implemented yet; this overload
+  // currently only receives the analysis results collected above.
+  return false;
+}
+
+static bool handleDeviceScopeSharedVariables(
+    Module &M, const SetVector<const GlobalVariable *> &LDSGlobals,
+    const SetVector<const Function *> &Kernels) {
+  // Pair up each LDS global with the enclosing function where the LDS global
+  // is defined.
+  ValueMap<const GlobalVariable *, const Function *> LDSToFunction;
+  for (const GlobalVariable *LDSGlobal : LDSGlobals)
+    pairUpLDSGlobalWithEnclosingFunction(LDSGlobal, LDSToFunction);
+
+  // Create the reverse map from enclosing function to LDS global list.
+  ValueMap<const Function *, SetVector<const GlobalVariable *>> FunctionToLDS;
+  createFunctionToLDSMap(LDSToFunction, FunctionToLDS);
+
+  // Pair up each kernel with the list of callees which define LDS globals.
+  ValueMap<const Function *, SetVector<const Function *>> KernelToCallee;
+  for (const Function *K : Kernels)
+    pairUpKernelWithCalleeList(M, K, FunctionToLDS, KernelToCallee);
+
+  // Pair up each kernel with all the associated LDS globals, including those
+  // directly defined within the kernel and those indirectly defined within
+  // its callees.
+  ValueMap<const Function *, SetVector<const GlobalVariable *>>
+      KernelToExtendedLDS;
+  for (const Function *K : Kernels)
+    pairUpKernelWithLDSList(K, KernelToCallee, FunctionToLDS,
+                            KernelToExtendedLDS);
+
+  // Get the size of each LDS global in bytes.
+  ValueMap<const GlobalVariable *, uint64_t> LDSToSize;
+  for (const GlobalVariable *LDSGlobal : LDSGlobals)
+    getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize);
+
+  return handleDeviceScopeSharedVariables(M, LDSToFunction, FunctionToLDS,
+                                          KernelToCallee, KernelToExtendedLDS,
+                                          LDSToSize);
+}
+
+static bool handleDeviceScopeSharedVariables(Module &M) {
+  // Collect all the (static) LDS globals defined within the current module.
+  SetVector<const GlobalVariable *> LDSGlobals;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        GV.hasInternalLinkage())
+      LDSGlobals.insert(&GV);
+
+  if (LDSGlobals.empty()) {
+    LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  // Collect all the amdgpu kernels defined within the current module.
+  SetVector<const Function *> Kernels;
+  for (Function &F : M.functions())
+    if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL && !F.isDeclaration())
+      Kernels.insert(&F);
+
+  if (Kernels.empty()) {
+    LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels);
+}
+
+bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) {
+  LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the "
+                       "module "
+                    << M.getName() << "\n");
+
+  // TODO: We only want to handle HIP kernels, and no kernels from other
+  // programming languages, like OpenCL, OpenMP, etc. Do we need to add a
+  // condition here for it, and skip running the pass for non-HIP kernels?
+  if (skipModule(M)) {
+    LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables "
+                         "in the module "
+                      << M.getName() << "\n");
+    return false;
+  }
+
+  bool Changed = handleDeviceScopeSharedVariables(M);
+
+  LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared "
+                       "variables in the module "
+                    << M.getName() << "\n");
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -40,6 +40,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableDeviceScopeSharedVariable;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -198,6 +198,12 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool, true> EnableDeviceScopeSharedVariable(
+    "amdgpu-enable-device-scope-shared-variable",
+    cl::desc("Support amdgpu device scope shared variables"),
+    cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable),
+    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -264,6 +270,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUDeviceScopeSharedVariablePass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -393,6 +400,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -770,6 +778,11 @@
   // but EarlyCSE can do neither of them.
   if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
     addEarlyCSEOrGVNPass();
+
+  // This pass is expected to run as the last IR pass, so keep it at the end
+  // of addIRPasses.
+  if (EnableDeviceScopeSharedVariable)
+    addPass(createAMDGPUDeviceScopeSharedVariablePass());
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUDeviceScopeSharedVariable.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
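
Note (editor's addition, illustrative only): the pass is off by default. With this
patch applied, it can be exercised through the new hidden flag, e.g.

  llc -march=amdgcn -amdgpu-enable-device-scope-shared-variable input.ll

which makes AMDGPUAlwaysInline skip the LDS-driven force-inlining and schedules
this pass at the end of addIRPasses. In its current form the pass only builds the
bookkeeping maps (LDS global to enclosing function, kernel to LDS-defining
callees, LDS global to size) and makes no IR changes; the lowering itself is
still marked TODO.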