diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -337,6 +337,14 @@ void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +ModulePass *createAMDGPULowerLDSGlobalPass(); +void initializeAMDGPULowerLDSGlobalPass(PassRegistry &); +extern char &AMDGPULowerLDSGlobalID; +struct AMDGPULowerLDSGlobalPass : PassInfoMixin { + AMDGPULowerLDSGlobalPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -117,13 +117,15 @@ // should only appear when IPO passes manages to move LDs defined in a kernel // into a single user function. - for (GlobalVariable &GV : M.globals()) { - // TODO: Region address - unsigned AS = GV.getAddressSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) - continue; - - recursivelyVisitUsers(GV, FuncsToAlwaysInline); + if (!AMDGPUTargetMachine::EnableLDSGlobalLowering) { + for (GlobalVariable &GV : M.globals()) { + // TODO: Region address + unsigned AS = GV.getAddressSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) + continue; + + recursivelyVisitUsers(GV, FuncsToAlwaysInline); + } } if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerLDSGlobal.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerLDSGlobal.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerLDSGlobal.cpp @@ -0,0 +1,501 @@ +//===-- AMDGPULowerLDSGlobal.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include + +#define DEBUG_TYPE "amdgpu-lower-lds-global" + +using namespace llvm; + +/// Wrapper function around `ValueMap` to detect if an element exists within it. +template +static bool contains(R &&VMap, const E &Element) { + return VMap.find(Element) != VMap.end(); +} + +// Traverse through the call graph nodes associated with the callees of current +// caller, and push them into stack. +static void pushCallGraphNodes(CallGraphNode *CGNode, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack) { + assert(CGNode && + "Call graph node associated with kernel/function definition cannot be " + "null"); + for (auto GI = CGNode->begin(), GE = CGNode->end(); GI != GE; ++GI) { + auto *CGN = GI->second; + assert(CGN && + "Call graph node associated with kernel/function definition cannot " + "be null"); + auto *CB = cast(GI->first.getValue()); + CGNodeStack.push_back(CGN); + CallBaseStack.push_back(CB); + } +} + +namespace { + +class LowerLDSGlobalImpl { +public: + explicit LowerLDSGlobalImpl(Module &M) : M(M), CG(CallGraph(M)) { + // Collect the functions whose address is taken within the module. + collectAddressTakenFunctions(); + } + + // Entry-point function. 
+ bool lower(); + +private: + Module &M; + CallGraph CG; + SmallPtrSet Kernels; + SmallPtrSet LDSGlobals; + SmallPtrSet AddressTakenSet; + ValueMap> LDSGlobalToAccessors; + ValueMap> AccessorToLDSGlobals; + ValueMap> KernelToCallees; + ValueMap> KernelToLDSGlobals; + + // Prune original kernel and LDS set based on constructed kernels to LDS + // globals map. + void pruneKernelAndLDSSet(); + + // Associate each kernel K with LDS globals which are being accessed by K + // and/or by the callees of K. + void createKernelToLDSGlobalsMap(); + + // The call site associated with `CGNode` is a "direct call site", and the + // information about the corresponding callee, say, `Callee` is available. + // Check if `Callee` defines LDS variables within it, if so, add it to + // `CalleeSet`, and push callees of `Callee` to `CGNodeStack` to continue the + // DFS search. + void handleDirectCallSite(CallGraphNode *CGNode, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet); + + // The call site `CB` associated with the call graph node `CGNode` is an + // "indirect call site". Depending on whether the metadata `!callee` is + // available at `CB` or not, we need to handle it accordingly. + void handleIndirectCallSite(CallGraphNode *CGNode, CallBase *CB, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet); + + // Handle call site `CB` depending on whether it is a direct or an indirect + // call site, return true if an indirect call site is being handled for the + // first time. + bool handleCallSite(CallGraphNode *CGNode, CallBase *CB, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet); + + // Traverse `CallGraph` starting from the `CallGraphNode` associated with each + // kernel `K` in DFS manner and collect all the callees which are reachable + // from K (including indirect callees). 
+ void createKernelToCalleesMap(); + + // Associate each kernel/function with the LDS globals which are being + // accessed within them. + void createAccessorToLDSGlobalsMap(); + + // For each `LDS`, recursively visit its user list and find all those + // kernels/functions within which the `LDS` is being accessed. + void createLDSGlobalToAccessorsMap(); + + // For each kernel `K`, collect LDS globals which are being accessed during + // the execution of `K`. + bool collectPerKernelAccessibleLDSGlobals(); + + // Collect all the amdgpu kernels defined within the current module. + bool collectKernels(); + + // Collect all the (static) LDS globals defined within the current module. + bool collectLDSGlobals(); + + // Collect functions whose address is taken within the module. + void collectAddressTakenFunctions(); +}; + +// Prune original kernel and LDS set based on constructed kernels to LDS globals +// map. +void LowerLDSGlobalImpl::pruneKernelAndLDSSet() { + SmallPtrSet ToBeRetainedKernels; + SmallPtrSet ToBeRetainedLDSGlobals; + + for (auto KI = KernelToLDSGlobals.begin(), KE = KernelToLDSGlobals.end(); + KI != KE; ++KI) { + ToBeRetainedKernels.insert(KI->first); + for (auto *LDS : KI->second) + ToBeRetainedLDSGlobals.insert(LDS); + } + + Kernels.swap(ToBeRetainedKernels); + LDSGlobals.swap(ToBeRetainedLDSGlobals); +} + +// Associate each kernel K with LDS globals which are being accessed by K and/or +// by the callees of K. +void LowerLDSGlobalImpl::createKernelToLDSGlobalsMap() { + for (auto *K : Kernels) { + SmallPtrSet LDSSet; + + // Insert all those LDS globals which are being accessed by kernel K itself. + if (contains(AccessorToLDSGlobals, K)) + LDSSet.insert(AccessorToLDSGlobals[K].begin(), + AccessorToLDSGlobals[K].end()); + + // Insert all those LDS globals which are being accessed by the callees of + // kernel K. 
+ for (auto *Callee : KernelToCallees[K]) { + if (contains(AccessorToLDSGlobals, Callee)) + LDSSet.insert(AccessorToLDSGlobals[Callee].begin(), + AccessorToLDSGlobals[Callee].end()); + } + + if (!LDSSet.empty()) + KernelToLDSGlobals[K] = LDSSet; + } +} + +// The call site associated with `CGNode` is a "direct call site", and the +// information about the corresponding callee, say, `Callee` is available. +// If `Callee` is a definition, then add it to `CalleeSet`, and push callees +// of `Callee` to `CGNodeStack` to continue the DFS search. +void LowerLDSGlobalImpl::handleDirectCallSite( + CallGraphNode *CGNode, SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet) { + auto *Callee = CGNode->getFunction(); + assert(Callee && "Expected a valid callee associated with call site"); + if (!Callee->isDeclaration()) { + CalleeSet.insert(Callee); + pushCallGraphNodes(CGNode, CGNodeStack, CallBaseStack); + } +} + +// The call site `CB` associated with the call graph node `CGNode` is an +// "indirect call site". Depending on whether the metadata `!callee` is +// available at `CB` or not, we need to handle it accordingly. +void LowerLDSGlobalImpl::handleIndirectCallSite( + CallGraphNode *CGNode, CallBase *CB, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet) { + if (auto *MD = CB->getMetadata(LLVMContext::MD_callees)) { + // The metadata "!callee" is available at the indirect call site `CB`, which + // means, all the potential target callees for the call site `CB` are + // successfully resolved at compile time. So, push them into stack so that + // they will be handled just like direct callees when they are eventually + // popped out. 
+ for (const auto &Op : MD->operands()) { + auto *CGN = CG[mdconst::extract_or_null(Op)]; + assert(CGN && + "Call graph node associated with kernel/function definition cannot" + " be null"); + assert(CGN->getFunction() && + "Expected a valid function which is included within !callee " + "metadata"); + CGNodeStack.push_back(CGN); + CallBaseStack.push_back(CB); + } + } else { + // The metadata "!callee" is *NOT* available at the indirect call site `CB`, + // which means, `CB` has *NO* information about potential target callees. + // The simplest possible *SAFE* assumption that we can make here is to + // consider all those "address taken" functions whose signature matches with + // that of the call site `CB`, and assume that all these signature matched + // "address taken" functions are possible potential callees. Thus, push all + // the signature matched "address taken" functions into stack so that they + // will be handled just like direct callees when they are eventually popped + // out. + auto *CBFTy = CB->getFunctionType(); + for (auto *CGN : AddressTakenSet) { + auto *F = CGN->getFunction(); + assert(F && "Expected a valid address taken function"); + auto *ADFTy = F->getFunctionType(); + if (ADFTy == CBFTy) { + CGNodeStack.push_back(CGN); + CallBaseStack.push_back(CB); + } + } + } +} + +// Handle the call site `CB` depending on whether it is a direct or an indirect +// call site, return true if an indirect call site is being handled for the +// first time. +bool LowerLDSGlobalImpl::handleCallSite( + CallGraphNode *CGNode, CallBase *CB, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet) { + bool IndirectCallSite = false; + + if (!CB->getCalledFunction()) { + // Call site `CB` is an indirect call site. 
But, if the `CGNode` has a + // function definition, say, `F`, associated with it, which means, we have + // already had encountered `CB` earlier, and had resolved it to a set of + // potential callees, and `F` is one among them. Handle `CB` just like + // direct callees are being handled. Otherwise, `CB` is encountered for first + // time, resolve it to a set of potential callees before handling them. + if (!CGNode->getFunction()) { + // Indirect call site `CB` is encountered first time, resolve it to a set + // of potential callees. + handleIndirectCallSite(CGNode, CB, CGNodeStack, CallBaseStack, CalleeSet); + IndirectCallSite = true; + } else { + // Indirect call site `CB` is already associated with a set of potential + // callees. A callee represented by `CGNode` is one among them, handle it + // just like direct callees are being handled. + handleDirectCallSite(CGNode, CGNodeStack, CallBaseStack, CalleeSet); + } + } else { + // Call site `CB` is a direct call site. Handle callee associated with it. + handleDirectCallSite(CGNode, CGNodeStack, CallBaseStack, CalleeSet); + } + + return IndirectCallSite; +} + +// Traverse `CallGraph` starting from the `CallGraphNode` associated with each +// kernel `K` in DFS manner and collect all the callees which are reachable from +// K (including indirect callees). +void LowerLDSGlobalImpl::createKernelToCalleesMap() { + for (auto *K : Kernels) { + auto *KernCGNode = CG[K]; + SmallVector CGNodeStack; + SmallVector CallBaseStack; + SmallPtrSet Visited; + SmallPtrSet CalleeSet; + + // Push the `CallGraphNode` associated with all the callees of the kernel `K` + // into `CGNodeStack`, and the corresponding call sites into + // `CallBaseStack`. + pushCallGraphNodes(KernCGNode, CGNodeStack, CallBaseStack); + + // Continue DFS search until no more call graph nodes to handle. 
+ while (!CGNodeStack.empty()) { + assert(CGNodeStack.size() == CallBaseStack.size() && + "Stack holding CallBase pointers is corrupted"); + auto *CGNode = CGNodeStack.pop_back_val(); + auto *CB = CallBaseStack.pop_back_val(); + + // `CGNode` is already visited and handled, ignore it and proceed to next + // one. + if (!Visited.insert(CGNode).second) + continue; + + if (handleCallSite(CGNode, CB, CGNodeStack, CallBaseStack, CalleeSet)) { + // The call site `CB` is an indirect call site which is being handled + // for the *first time*, where it is resolved to a set of potential + // callees. Since the indirect call site does not bind to any particular + // function, and the `CGNode` is same for all the function pointers + // which have same signature, we should *NOT* assume that `CGNode` is + // visited unlike in case of direct call site. + Visited.erase(CGNode); + } + } + + assert(CallBaseStack.empty() && + "Stack holding CallBase pointers is corrupted"); + + KernelToCallees[K] = CalleeSet; + } +} + +// Associate each kernel/function with the LDS globals which are being accessed +// within them. +void LowerLDSGlobalImpl::createAccessorToLDSGlobalsMap() { + for (auto LI = LDSGlobalToAccessors.begin(), LE = LDSGlobalToAccessors.end(); + LI != LE; ++LI) { + auto *LDS = LI->first; + for (auto *A : LI->second) { + if (!contains(AccessorToLDSGlobals, A)) { + SmallPtrSet LDSSet; + LDSSet.insert(LDS); + AccessorToLDSGlobals[A] = LDSSet; + } else + AccessorToLDSGlobals[A].insert(LDS); + } + } +} + +// For each `LDS`, recursively visit its user list and find all those +// kernels/functions within which the `LDS` is being accessed. 
+void LowerLDSGlobalImpl::createLDSGlobalToAccessorsMap() { + for (auto *LDS : LDSGlobals) { + assert(!LDS->user_empty() && + "LDS user list cannot be empty since it must have been successfully " + "defined within either kernel or function"); + + SmallPtrSet LDSAccessors; + SmallVector UserStack(LDS->users()); + SmallPtrSet Visited; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + if (!Visited.insert(U).second) + continue; + + auto *I = dyn_cast(U); + + // If `U` is not an `Instruction`, then it should be a `Constant` which is + // nested within an `Instruction`. Push-back users of `U`, and continue + // further exploring the stack until an `Instruction` is found. + if (!I) { + assert(isa(U) && "Expected a constant expression"); + append_range(UserStack, U->users()); + continue; + } + + // We have successfully found a kernel/function within which the `LDS` is + // being accessed, insert it into `LDSAccessors` set. + LDSAccessors.insert(I->getParent()->getParent()); + } + + LDSGlobalToAccessors[LDS] = LDSAccessors; + } +} + +// For each kernel `K`, collect LDS globals which are being accessed during the +// execution of `K`. +bool LowerLDSGlobalImpl::collectPerKernelAccessibleLDSGlobals() { + // Associate each LDS with the kernels/functions within which the LDS is being + // accessed. + createLDSGlobalToAccessorsMap(); + + // Associate each kernel/function with the LDS globals which are being + // accessed within them. + createAccessorToLDSGlobalsMap(); + + // Associate each kernel K with callees which are reachable from K (including + // indirect callees). + createKernelToCalleesMap(); + + // Associate each kernel K with LDS globals which are being accessed by K + // and/or by the callees of K. + createKernelToLDSGlobalsMap(); + + // If *none* of the kernels associate with any LDS globals, then nothing to do. + return !KernelToLDSGlobals.empty(); +} + +// Collect all the amdgpu kernels defined within the current module. 
+bool LowerLDSGlobalImpl::collectKernels() { + for (auto &F : M.functions()) { + if (AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()) && + !F.isDeclaration()) + Kernels.insert(&F); + } + + return !Kernels.empty(); +} + +// Collect all the (static) LDS globals defined within the current module. +bool LowerLDSGlobalImpl::collectLDSGlobals() { + for (auto &GV : M.globals()) { + if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + !GV.isDeclaration() && !GV.getType()->isEmptyTy()) + LDSGlobals.insert(&GV); + } + + return !LDSGlobals.empty(); +} + +// Collect functions whose address is taken within the module. +void LowerLDSGlobalImpl::collectAddressTakenFunctions() { + auto *ExternalCallingNode = CG.getExternalCallingNode(); + assert(ExternalCallingNode && + "Call graph node associated with kernel/function definition cannot be " + "null"); + + for (auto GI = ExternalCallingNode->begin(), GE = ExternalCallingNode->end(); + GI != GE; ++GI) { + auto *CGN = GI->second; + assert(CGN && + "Call graph node associated with kernel/function definition cannot " + "be null"); + auto *F = CGN->getFunction(); + // FIXME: Anything else need to be excluded? + if (!F || F->isDeclaration() || + AMDGPU::isModuleEntryFunctionCC(F->getCallingConv())) + continue; + AddressTakenSet.insert(CGN); + } +} + +// Entry-point function. +bool LowerLDSGlobalImpl::lower() { + // If there are *no* LDS globals defined within the module, or if there are + // *no* kernels defined within the module, or if there exist *no* kernel + // *execution* which accesses LDS globals at run time, then nothing to do. + if (!collectLDSGlobals() || !collectKernels() || + !collectPerKernelAccessibleLDSGlobals()) + return false; + + // Prune original kernel and LDS set based on constructed kernels to LDS + // globals map. + pruneKernelAndLDSSet(); + + // FIXME: It should return 'true' once the lowering code is added in the later + // patch. 
+ return false; +} + +class AMDGPULowerLDSGlobal : public ModulePass { +public: + static char ID; + + AMDGPULowerLDSGlobal() : ModulePass(ID) { + initializeAMDGPULowerLDSGlobalPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPULowerLDSGlobal::ID = 0; +char &llvm::AMDGPULowerLDSGlobalID = AMDGPULowerLDSGlobal::ID; + +INITIALIZE_PASS(AMDGPULowerLDSGlobal, "amdgpu-lower-lds-global", + "Lower LDS Global Variables", false /*only look at the cfg*/, + false /*analysis pass*/) + +bool AMDGPULowerLDSGlobal::runOnModule(Module &M) { + LowerLDSGlobalImpl LDSLowerer{M}; + return LDSLowerer.lower(); +} + +ModulePass *llvm::createAMDGPULowerLDSGlobalPass() { + return new AMDGPULowerLDSGlobal(); +} + +PreservedAnalyses AMDGPULowerLDSGlobalPass::run(Module &M, + ModuleAnalysisManager &AM) { + LowerLDSGlobalImpl LDSLowerer{M}; + LDSLowerer.lower(); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -35,6 +35,7 @@ static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableFixedFunctionABI; + static bool EnableLDSGlobalLowering; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -193,6 +193,12 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt EnableLDSGlobalLowering( + "amdgpu-enable-lds-global-lowering", + cl::desc("Enable LDS global variable lowering pass"), + cl::location(AMDGPUTargetMachine::EnableLDSGlobalLowering), cl::init(true), + cl::Hidden); + extern "C" 
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -259,6 +265,7 @@ initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); initializeSIAddIMGInitPass(*PR); + initializeAMDGPULowerLDSGlobalPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -388,6 +395,7 @@ bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; +bool AMDGPUTargetMachine::EnableLDSGlobalLowering = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -501,6 +509,10 @@ PM.addPass(AMDGPUAlwaysInlinePass()); return true; } + if (PassName == "amdgpu-lower-lds-global") { + PM.addPass(AMDGPULowerLDSGlobalPass()); + return true; + } return false; }); PB.registerPipelineParsingCallback( @@ -847,6 +859,12 @@ disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + // We expect to run this pass as a first AMDGPU IR pass so that new + // instructions being added in this pass can possibly undergo further + // transformations via subsequent passes. + if (EnableLDSGlobalLowering) + addPass(createAMDGPULowerLDSGlobalPass()); + addPass(createAMDGPUPrintfRuntimeBinding()); // This must occur before inlining, as the inliner will not look through diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -67,6 +67,7 @@ AMDGPULowerIntrinsics.cpp AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp + AMDGPULowerLDSGlobal.cpp AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp