diff --git a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
--- a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
+++ b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h
@@ -17,6 +17,9 @@
 
 namespace omp {
 
+/// Summary of a kernel (=entry point for target offloading).
+using Kernel = Function *;
+
 /// Helper to remember if the module contains OpenMP (runtime calls), to be used
 /// foremost with containsOpenMP.
 struct OpenMPInModule {
@@ -30,8 +33,17 @@
   bool isKnown() { return Value != OpenMP::UNKNOWN; }
   operator bool() { return Value != OpenMP::NOT_FOUND; }
 
+  /// Return the known kernels (=GPU entry points) in the module.
+  SmallPtrSetImpl<Kernel> &getKernels() { return Kernels; }
+
+  /// Identify kernels in the module and populate the Kernels set.
+  void identifyKernels(Module &M);
+
 private:
   enum class OpenMP { FOUND, NOT_FOUND, UNKNOWN } Value = OpenMP::UNKNOWN;
+
+  /// Collection of known kernels (=GPU entry points) in the module.
+  SmallPtrSet<Kernel, 8> Kernels;
 };
 
 /// Helper to determine if \p M contains OpenMP (runtime calls).
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -39,6 +39,8 @@
 
 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                     cl::Hidden);
+static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
+                                        cl::init(false), cl::Hidden);
 
 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
           "Number of OpenMP runtime calls deduplicated");
@@ -48,6 +50,8 @@
           "Number of OpenMP runtime functions identified");
 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
           "Number of OpenMP runtime function uses identified");
+STATISTIC(NumOpenMPTargetRegionKernels,
+          "Number of OpenMP target region entry points (=kernels) identified");
 
 #if !defined(NDEBUG)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
@@ -99,9 +103,10 @@
 struct OMPInformationCache : public InformationCache {
   OMPInformationCache(Module &M, AnalysisGetter &AG,
                       BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
-                      SmallPtrSetImpl<Function *> &ModuleSlice)
+                      SmallPtrSetImpl<Function *> &ModuleSlice,
+                      SmallPtrSetImpl<Kernel> &Kernels)
       : InformationCache(M, AG, Allocator, CGSCC), ModuleSlice(ModuleSlice),
-        OMPBuilder(M) {
+        OMPBuilder(M), Kernels(Kernels) {
     OMPBuilder.initialize();
     initializeRuntimeFunctions();
     initializeInternalControlVars();
@@ -399,6 +404,9 @@
 
     // TODO: We should attach the attributes defined in OMPKinds.def.
   }
+
+  /// Collection of known kernels (\see Kernel) in the module.
+  SmallPtrSetImpl<Kernel> &Kernels;
 };
 
 struct OpenMPOpt {
@@ -423,26 +431,10 @@
                       << " functions in a slice with "
                       << OMPInfoCache.ModuleSlice.size() << " functions\n");
 
-    /// Print initial ICV values for testing.
-    /// FIXME: This should be done from the Attributor once it is added.
-    if (PrintICVValues) {
-      InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel};
-
-      for (Function *F : OMPInfoCache.ModuleSlice) {
-        for (auto ICV : ICVs) {
-          auto ICVInfo = OMPInfoCache.ICVs[ICV];
-          auto Remark = [&](OptimizationRemark OR) {
-            return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
-                      << " Value: "
-                      << (ICVInfo.InitValue
-                              ? ICVInfo.InitValue->getValue().toString(10, true)
-                              : "IMPLEMENTATION_DEFINED");
-          };
-
-          emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
-        }
-      }
-    }
+    if (PrintICVValues)
+      printICVs();
+    if (PrintOpenMPKernels)
+      printKernels();
 
     Changed |= runAttributor();
 
@@ -455,6 +447,42 @@
     return Changed;
   }
 
+  /// Print initial ICV values for testing.
+  /// FIXME: This should be done from the Attributor once it is added.
+  void printICVs() const {
+    InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel};
+
+    for (Function *F : OMPInfoCache.ModuleSlice) {
+      for (auto ICV : ICVs) {
+        auto ICVInfo = OMPInfoCache.ICVs[ICV];
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
+                    << " Value: "
+                    << (ICVInfo.InitValue
+                            ? ICVInfo.InitValue->getValue().toString(10, true)
+                            : "IMPLEMENTATION_DEFINED");
+        };
+
+        emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
+      }
+    }
+  }
+
+  /// Print OpenMP GPU kernels for testing.
+  void printKernels() const {
+    for (Function *F : SCC) {
+      if (!OMPInfoCache.Kernels.count(F))
+        continue;
+
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "OpenMP GPU kernel "
+                  << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
+      };
+
+      emitRemarkOnFunction(F, "OpenMPGPU", Remark);
+    }
+  }
+
   /// Return the call if \p U is a callee use in a regular call. If \p RFI is
   /// given it has to be the callee or a nullptr is returned.
   static CallInst *getCallIfRegularCall(
@@ -775,7 +803,7 @@
   template <typename RemarkKind,
             typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
   void emitRemark(Instruction *Inst, StringRef RemarkName,
-                  RemarkCallBack &&RemarkCB) {
+                  RemarkCallBack &&RemarkCB) const {
     Function *F = Inst->getParent()->getParent();
     auto &ORE = OREGetter(F);
 
@@ -785,9 +813,10 @@
 
   /// Emit a remark on a function. Since only OptimizationRemark is supporting
   /// this, it can't be made generic.
-  void emitRemarkOnFunction(
-      Function *F, StringRef RemarkName,
-      function_ref<OptimizationRemark(OptimizationRemark &&)> &&RemarkCB) {
+  void
+  emitRemarkOnFunction(Function *F, StringRef RemarkName,
+                       function_ref<OptimizationRemark(OptimizationRemark &&)>
+                           &&RemarkCB) const {
     auto &ORE = OREGetter(F);
 
     ORE.emit([&]() {
@@ -1044,7 +1073,8 @@
   SetVector<Function *> Functions(SCC.begin(), SCC.end());
   BumpPtrAllocator Allocator;
   OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
-                                /*CGSCC*/ &Functions, ModuleSlice);
+                                /*CGSCC*/ &Functions, ModuleSlice,
+                                OMPInModule.getKernels());
 
   Attributor A(Functions, InfoCache, CGUpdater);
 
@@ -1109,9 +1139,9 @@
     AnalysisGetter AG;
     SetVector<Function *> Functions(SCC.begin(), SCC.end());
     BumpPtrAllocator Allocator;
-    OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG,
-                                  Allocator,
-                                  /*CGSCC*/ &Functions, ModuleSlice);
+    OMPInformationCache InfoCache(
+        *(Functions.back()->getParent()), AG, Allocator,
+        /*CGSCC*/ &Functions, ModuleSlice, OMPInModule.getKernels());
 
     Attributor A(Functions, InfoCache, CGUpdater);
 
@@ -1125,14 +1155,47 @@
 
 } // end anonymous namespace
 
+/// Collect every function that !nvvm.annotations marks as a "kernel" into the
+/// Kernels set.
+void OpenMPInModule::identifyKernels(Module &M) {
+  // Only look the node up; creating it would modify modules without kernels.
+  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
+  if (!MD)
+    return;
+
+  for (auto *Op : MD->operands()) {
+    if (Op->getNumOperands() < 2)
+      continue;
+    MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
+    if (!KindID || KindID->getString() != "kernel")
+      continue;
+
+    Function *KernelFn =
+        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
+    if (!KernelFn)
+      continue;
+
+    // Count each kernel once, even if it is annotated more than once.
+    if (Kernels.insert(KernelFn).second)
+      ++NumOpenMPTargetRegionKernels;
+  }
+}
+
 bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
   if (OMPInModule.isKnown())
     return OMPInModule;
-
 #define OMP_RTL(_Enum, _Name, ...)                                             \
-  if (M.getFunction(_Name))                                                    \
-    return OMPInModule = true;
+  else if (M.getFunction(_Name)) OMPInModule = true;
 #include "llvm/Frontend/OpenMP/OMPKinds.def"
+
+  // Identify kernels once. TODO: We should split the OMPInformationCache into a
+  // module and an SCC part. The kernel information, among other things, could
+  // go into the module part.
+  if (OMPInModule.isKnown() && OMPInModule) {
+    OMPInModule.identifyKernels(M);
+    return true;
+  }
+
   return OMPInModule = false;
 }
 
diff --git a/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll b/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll
@@ -0,0 +1,27 @@
+; RUN: opt -passes=openmpopt -pass-remarks=openmp-opt -openmp-print-gpu-kernels -disable-output < %s 2>&1 | FileCheck %s --implicit-check-not=non_kernel
+; RUN: opt -openmpopt -pass-remarks=openmp-opt -openmp-print-gpu-kernels -disable-output < %s 2>&1 | FileCheck %s --implicit-check-not=non_kernel
+
+; CHECK-DAG: remark: <unknown>:0:0: OpenMP GPU kernel kernel1
+; CHECK-DAG: remark: <unknown>:0:0: OpenMP GPU kernel kernel2
+
+define void @kernel1() {
+  ret void
+}
+
+define void @kernel2() {
+  ret void
+}
+
+define void @non_kernel() {
+  ret void
+}
+
+; Needed to trigger the openmp-opt pass
+declare dso_local void @__kmpc_kernel_prepare_parallel(i8*)
+
+!nvvm.annotations = !{!2, !0, !1, !3, !1, !2}
+
+!0 = !{void ()* @kernel1, !"kernel", i32 1}
+!1 = !{void ()* @non_kernel, !"non_kernel", i32 1}
+!2 = !{null, !"align", i32 1}
+!3 = !{void ()* @kernel2, !"kernel", i32 1}