diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -373,10 +373,22 @@ Expected findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; + /// Get additional info for the kernel \p Identifier, e.g., register spill counts. Returns an empty optional if KernelInfoMap has no entry for it + std::optional + getKernelInfo(StringRef Identifier) const { + auto It = KernelInfoMap.find(Identifier); + + if (It == KernelInfoMap.end()) + return {}; + + return It->second; + } + private: /// The exectuable loaded on the agent. hsa_executable_t Executable; hsa_code_object_t CodeObject; + StringMap KernelInfoMap; }; /// Class implementing the AMDGPU kernel functionalities which derives from the @@ -426,6 +438,12 @@ // TODO: Read the kernel descriptor for the max threads per block. May be // read from the image. + // Get additional kernel info read from image; a missing entry is only reported, it is not an error + KernelInfo = AMDImage.getKernelInfo(getName()); + if (!KernelInfo.has_value()) + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(), + "Could not read extra information for kernel %s.\n", getName()); + return Plugin::success(); } @@ -434,6 +452,11 @@ uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + /// Print more elaborate kernel launch info for AMDGPU + Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, uint32_t NumThreads, + uint64_t NumBlocks) const override; + /// The default number of blocks is common to the whole device. uint32_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override { return GenericDevice.getDefaultNumBlocks(); @@ -462,6 +485,9 @@ /// The size of implicit kernel arguments. const uint32_t ImplicitArgsSize; + + /// Additional info for the AMD GPU kernel as returned by getKernelInfo; empty if the image had none + std::optional KernelInfo; }; /// Class representing an HSA signal. 
Signals are used to define dependencies @@ -2200,6 +2226,10 @@ if (Result) return Plugin::error("Loaded HSA executable does not validate"); + if (auto Err = + utils::readAMDGPUMetaDataFromImage(getMemoryBuffer(), KernelInfoMap)) + return Err; + return Plugin::success(); } @@ -2571,6 +2601,50 @@ GroupSize, ArgsMemoryManager); } +Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumThreads, + uint64_t NumBlocks) const { + // Only do all this when the output is requested + if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL)) + return Plugin::success(); + + // We don't have data to print additional info, but no hard error + if (!KernelInfo.has_value()) + return Plugin::success(); + + // General Info + auto ConstWGSize = getDefaultNumThreads(GenericDevice); + auto NumGroups = NumBlocks; + auto ThreadsPerGroup = NumThreads; // Threads passed in for this launch, not the device default + auto NumTeams = KernelArgs.NumTeams[0]; // Only first dimension + auto ThreadLimit = KernelArgs.ThreadLimit[0]; // Only first dimension + + // Kernel Arguments Info + auto ArgNum = KernelArgs.NumArgs; + auto LoopTripCount = KernelArgs.Tripcount; + + // Details for AMDGPU kernels + auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; + auto SGPRCount = (*KernelInfo).SGPRCount; + auto VGPRCount = (*KernelInfo).VGPRCount; + auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount; + auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount; + + // TODO set correctly once host services available + auto HostCallRequired = false; + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), + "SGN:%s ConstWGSize:%d args:%d teamsXthrds:(%4luX%4d) " + "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " + "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d n:%s\n", + getExecutionModeName(), ConstWGSize, ArgNum, NumGroups, ThreadsPerGroup, + NumTeams, ThreadLimit, GroupSegmentSize, SGPRCount, VGPRCount, + SGPRSpillCount, VGPRSpillCount, LoopTripCount, 
HostCallRequired, + getName()); + + return Plugin::success(); +} + GenericPluginTy *Plugin::createPlugin() { return new AMDGPUPluginTy(); } GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) { diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h --- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -17,6 +17,14 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" +#include "llvm/Support/MemoryBufferRef.h" + +#include "llvm/Support/YAMLTraits.h" namespace llvm { namespace omp { @@ -127,6 +135,170 @@ return true; } +/// Per-kernel metadata read from the AMDGPU image. All fields default to zero so that entries missing from the metadata are never read uninitialized. +struct KernelMetaDataTy { + uint64_t KernelObject = 0; + uint32_t GroupSegmentList = 0; + uint32_t PrivateSegmentSize = 0; + uint32_t SGPRCount = 0; + uint32_t VGPRCount = 0; + uint32_t SGPRSpillCount = 0; + uint32_t VGPRSpillCount = 0; + uint32_t KernelSegmentSize = 0; + uint32_t ExplicitArgumentCount = 0; + uint32_t ImplicitArgumentCount = 0; +}; +namespace { + +/// Reads the AMDGPU specific per-kernel-metadata from an image. +class KernelInfoReader { +public: + KernelInfoReader(StringMap &KIM) : KernelInfoMap(KIM) {} + + /// Process ELF note to read AMDGPU metadata from respective information + /// fields. 
+ Error processNote(const object::ELF64LE::Note &Note) { + if (Note.getName() != "AMDGPU") + return Error::success(); // We are not interested in other things + + assert(Note.getType() == ELF::NT_AMDGPU_METADATA && + "Parse AMDGPU MetaData"); + auto Desc = Note.getDesc(); + StringRef MsgPackString = + StringRef(reinterpret_cast(Desc.data()), Desc.size()); + msgpack::Document MsgPackDoc; + if (!MsgPackDoc.readFromBlob(MsgPackString, /*Multi=*/false)) + return Error::success(); + + AMDGPU::HSAMD::V3::MetadataVerifier Verifier(true); + if (!Verifier.verify(MsgPackDoc.getRoot())) + return Error::success(); + + auto RootMap = MsgPackDoc.getRoot().getMap(true); + + if (auto Err = iterateAMDKernels(RootMap)) + return Err; + + return Error::success(); + } + +private: + /// Extracts the relevant information via simple string look-up in the msgpack + /// document elements. + Error extractKernelData(msgpack::MapDocNode::MapTy::value_type V, + std::string &KernelName, + KernelMetaDataTy &KernelData) { + if (!V.first.isString()) + return Error::success(); + + const auto isKey = [](const msgpack::DocNode &DK, StringRef SK) { + return DK.getString() == SK; + }; + + if (isKey(V.first, ".name")) { + KernelName = V.second.toString(); + } else if (isKey(V.first, ".sgpr_count")) { + KernelData.SGPRCount = V.second.getUInt(); + } else if (isKey(V.first, ".sgpr_spill_count")) { + KernelData.SGPRSpillCount = V.second.getUInt(); + } else if (isKey(V.first, ".vgpr_count")) { + KernelData.VGPRCount = V.second.getUInt(); + } else if (isKey(V.first, ".vgpr_spill_count")) { + KernelData.VGPRSpillCount = V.second.getUInt(); + } else if (isKey(V.first, ".private_segment_fixed_size")) { + KernelData.PrivateSegmentSize = V.second.getUInt(); + } else if (isKey(V.first, ".group_segment_fixed_size")) { + KernelData.GroupSegmentList = V.second.getUInt(); + } + + return Error::success(); + } + + /// Get the "amdhsa.kernels" element from the msgpack Document + Expected 
getAMDKernelsArray(msgpack::MapDocNode &MDN) { + auto Res = MDN.find("amdhsa.kernels"); + if (Res == MDN.end()) + return createStringError(inconvertibleErrorCode(), + "Could not find amdhsa.kernels key"); + + auto Pair = *Res; + assert(Pair.second.isArray() && + "AMDGPU kernel entries are arrays of entries"); + + return Pair.second.getArray(); + } + + /// Iterate all entries for one "amdhsa.kernels" entry. Each entry is a + /// MapDocNode that either maps a string to a single value (most of them) or + /// to another array of things. Currently, we only handle the case that maps + /// to scalar value. + Error generateKernelInfo(msgpack::ArrayDocNode::ArrayTy::iterator It) { + KernelMetaDataTy KernelData; + std::string KernelName; + auto Entry = (*It).getMap(); + for (auto MI = Entry.begin(), E = Entry.end(); MI != E; ++MI) + if (auto Err = extractKernelData(*MI, KernelName, KernelData)) + return Err; + + KernelInfoMap.insert({KernelName, KernelData}); + return Error::success(); + } + + /// Go over the list of AMD kernels in the "amdhsa.kernels" entry + Error iterateAMDKernels(msgpack::MapDocNode &MDN) { + auto KernelsOrErr = getAMDKernelsArray(MDN); + if (auto Err = KernelsOrErr.takeError()) + return Err; + + auto KernelsArr = *KernelsOrErr; + for (auto It = KernelsArr.begin(), E = KernelsArr.end(); It != E; ++It) { + if (!It->isMap()) + continue; // we expect pairs + + // Obtain the value for the different entries. 
Each array entry is a + // MapDocNode + if (auto Err = generateKernelInfo(It)) + return Err; + } + return Error::success(); + } + + // Kernel names are the keys + StringMap &KernelInfoMap; +}; +} // namespace + +/// Reads the AMDGPU specific metadata from the ELF file and populates the +/// KernelInfoMap. Defined inline because this header may be included by more than one translation unit. +inline Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer, + StringMap &KernelInfoMap) { + auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer()); + if (auto Err = ELFOrError.takeError()) + return Err; + + const object::ELF64LEFile ELFObj = ELFOrError.get(); + ArrayRef Sections = cantFail(ELFObj.sections()); + KernelInfoReader Reader(KernelInfoMap); + for (const auto &S : Sections) { + if (S.sh_type != ELF::SHT_NOTE) + continue; + + // Out-parameter of the note iterator. Declared per note section so that no unchecked Error is alive on the early returns above. + Error Err = Error::success(); + for (const auto N : ELFObj.notes(S, Err)) { + if (Err) + return Err; + // Fills the KernelInfoMap entries in the reader + if ((Err = Reader.processNote(N))) + return Err; + } + // A failure of the note iteration itself ends the loop, so Err has to be inspected here as well + if (Err) + return Err; + } + + return Error::success(); +} } // namespace utils } // namespace plugin } // namespace target diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -196,6 +196,32 @@ return false; } +protected: + /// Get the execution mode name of the kernel. + const char *getExecutionModeName() const { + switch (ExecutionMode) { + case OMP_TGT_EXEC_MODE_SPMD: + return "SPMD"; + case OMP_TGT_EXEC_MODE_GENERIC: + return "Generic"; + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return "Generic-SPMD"; + } + llvm_unreachable("Unknown execution mode!"); + } + + /// Prints generic kernel launch information. 
+ Error printLaunchInfo(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, uint32_t NumThreads, + uint64_t NumBlocks) const; + + /// Prints plugin-specific kernel launch information after generic kernel + /// launch information. Called by printLaunchInfo; the default implementation prints nothing and returns Plugin::success(). + virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumThreads, + uint64_t NumBlocks) const; + private: /// Prepare the arguments before launching the kernel. void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, @@ -225,19 +251,6 @@ } bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; } - /// Get the execution mode name of the kernel. - const char *getExecutionModeName() const { - switch (ExecutionMode) { - case OMP_TGT_EXEC_MODE_SPMD: - return "SPMD"; - case OMP_TGT_EXEC_MODE_GENERIC: - return "Generic"; - case OMP_TGT_EXEC_MODE_GENERIC_SPMD: - return "Generic-SPMD"; - } - llvm_unreachable("Unknown execution mode!"); - } - /// The kernel name. const char *Name; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -218,6 +218,25 @@ return initImpl(GenericDevice, Image); } +Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumThreads, + uint64_t NumBlocks) const { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), + "Launching kernel %s with %" PRIu64 + " blocks and %d threads in %s mode\n", + getName(), NumBlocks, NumThreads, getExecutionModeName()); + return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads, + NumBlocks); +} + +Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumThreads, + uint64_t 
NumBlocks) const { + return Plugin::success(); +} + Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, AsyncInfoWrapperTy &AsyncInfoWrapper) const { @@ -232,10 +251,9 @@ uint64_t NumBlocks = getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount, NumThreads); - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), - "Launching kernel %s with %" PRIu64 - " blocks and %d threads in %s mode\n", - getName(), NumBlocks, NumThreads, getExecutionModeName()); + if (auto Err = + printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) + return Err; return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, KernelArgsPtr, AsyncInfoWrapper); diff --git a/openmp/libomptarget/test/offloading/info.c b/openmp/libomptarget/test/offloading/info.c --- a/openmp/libomptarget/test/offloading/info.c +++ b/openmp/libomptarget/test/offloading/info.c @@ -1,7 +1,9 @@ // RUN: %libomptarget-compile-generic \ // RUN: -gline-tables-only -fopenmp-extensions // RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \ -// RUN: %fcheck-generic -allow-empty -check-prefix=INFO +// RUN: %fcheck-generic -allow-empty -check-prefixes=INFO +// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 | \ +// RUN: %fcheck-amdgcn-amd-amdhsa -allow-empty -check-prefixes=INFO,AMDGPU #include #include @@ -37,6 +39,7 @@ // INFO: info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments: // INFO: info: firstprivate(val)[4] // INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode +// AMDGPU: AMDGPU device {{[0-9]}} info: SGN:Generic ConstWGSize:{{[0-9]+}} args:{{[0-9]}} teamsXthrds:({{ [0-9]+}}X {{[0-9]+}}) reqd:( {{[0-9]+}}X {{[0-9]+}}) lds_usage:{{[0-9]+}}B sgpr_count:{{[0-9]+}} vgpr_count:{{[0-9]+}} sgpr_spill_count:{{[0-9]+}} vgpr_spill_count:{{[0-9]+}} tripcount:{{[0-9]+}} rpc:0 
n:__omp_offloading_{{.*}}main{{.*}} // INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}: // INFO: info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration // INFO: info: {{.*}} {{.*}} 256 1 0 C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}