diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt @@ -68,6 +68,8 @@ add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED src/rtl.cpp + utils/UtilitiesHSA.cpp + utils/msgpack.cpp ${LIBOMPTARGET_EXTRA_SOURCE} ADDITIONAL_HEADER_DIRS diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -25,6 +25,7 @@ #include "GlobalHandler.h" #include "PluginInterface.h" #include "Utilities.h" +#include "UtilitiesHSA.h" #include "UtilitiesRTL.h" #include "omptarget.h" @@ -373,10 +374,19 @@ Expected findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; + KernelMetaDataTy getKernelInfo(const std::string &Identifier) const { + return KernelInfoMap.at(Identifier); + } + SymbolInfoTy getSymbolinfo(const std::string &Identifier) const { + return SymbolInfoMap.at(Identifier); + } + private: /// The exectuable loaded on the agent. hsa_executable_t Executable; hsa_code_object_t CodeObject; + std::map KernelInfoMap; + std::map SymbolInfoMap; }; /// Class implementing the AMDGPU kernel functionalities which derives from the @@ -426,6 +436,9 @@ // TODO: Read the kernel descriptor for the max threads per block. May be // read from the image. 
+ // Read additional kernel meta info from image + KernelInfo = AMDImage.getKernelInfo(getName()); + return Plugin::success(); } @@ -434,6 +447,11 @@ uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + /// Print more elaborate kernel launch info for AMDGPU + Error printLaunchInfo(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, uint32_t NumThreads, + uint64_t NumBlocks) const; + /// The default number of blocks is common to the whole device. uint32_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override { return GenericDevice.getDefaultNumBlocks(); @@ -462,6 +480,9 @@ /// The size of implicit kernel arguments. const uint32_t ImplicitArgsSize; + + /// Additional Info for the AMD GPU Kernel + KernelMetaDataTy KernelInfo; }; /// Class representing an HSA signal. Signals are used to define dependencies @@ -2200,6 +2221,21 @@ if (Result) return Plugin::error("Loaded HSA executable does not validate"); + // Read the additional info from the image + std::vector Executables{Executable}; + Status = utils::core::moduleRegisterFromMemoryToPlace( + KernelInfoMap, SymbolInfoMap, getStart(), getSize(), Device.getAgent(), + [&](void *Data, size_t Size) { + // Currently we do nothing here. Can be used later for host service to + // determine if host services are needed for this kernel. 
+ return HSA_STATUS_SUCCESS; + }, + Executables); + + if (auto Err = Plugin::check( + Status, "Error in reading symbol and metadata from image")) + return Err; + return Plugin::success(); } @@ -2571,6 +2607,44 @@ GroupSize, ArgsMemoryManager); } +Error AMDGPUKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumThreads, + uint64_t NumBlocks) const { + // Only do all this when the output is requested + if (getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL) { + // General Info + auto DeviceId = GenericDevice.getDeviceId(); + auto ConstWGSize = getDefaultNumThreads(GenericDevice); + auto NumGroups = NumBlocks; + auto ThreadsPerGroup = getDefaultNumThreads(GenericDevice); + auto NumTeams = KernelArgs.NumTeams[0]; // Only first dimension + auto ThreadLimit = KernelArgs.ThreadLimit[0]; // Only first dimension + + // Kernel Arguments Info + auto ArgNum = KernelArgs.NumArgs; + auto LoopTripCount = KernelArgs.Tripcount; + + // Custom KernelType info from AMDGPU reader + auto GroupSegmentSize = KernelInfo.group_segment_size; + auto SgprCount = KernelInfo.sgpr_count; + auto VgprCount = KernelInfo.vgpr_count; + auto SgprSpillCount = KernelInfo.sgpr_spill_count; + auto VgprSpillCount = KernelInfo.vgpr_spill_count; + + auto HostCallRequired = false; + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), + "DEVID:%2d SGN:%s ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " + "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " + "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d n:%s\n", + DeviceId, getExecutionModeName(), ConstWGSize, ArgNum, NumGroups, + ThreadsPerGroup, NumTeams, ThreadLimit, GroupSegmentSize, SgprCount, + VgprCount, SgprSpillCount, VgprSpillCount, LoopTripCount, + HostCallRequired, getName()); + } + return Plugin::success(); +} + GenericPluginTy *Plugin::createPlugin() { return new AMDGPUPluginTy(); } GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) { diff --git 
a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesHSA.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesHSA.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesHSA.h @@ -0,0 +1,86 @@ +//===----RTLs/amdgpu/utils/UtilitiesRTL.h ------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// HSA Utilities for AMDGPU plugins +// Allows inspection of AMDGPU specfic metadata in AMDGPU images +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/ELF.h" + +#include +#include +#include + +#include "hsa.h" + +struct KernelMetaDataTy { + uint64_t kernel_object; + uint32_t group_segment_size; + uint32_t private_segment_size; + uint32_t sgpr_count; + uint32_t vgpr_count; + uint32_t sgpr_spill_count; + uint32_t vgpr_spill_count; + uint32_t kernel_segment_size; + uint32_t explicit_argument_count; + uint32_t implicit_argument_count; + std::string kind; +}; + +struct SymbolInfoTy { + uint64_t addr; + uint32_t size; +}; + +struct SymbolInfo { + const void *Addr = nullptr; + uint32_t Size = UINT32_MAX; + uint32_t ShType = llvm::ELF::SHT_NULL; +}; + +bool imageContainsSymbol(void *Data, size_t Size, const char *Sym); +int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName, + SymbolInfo *Res); + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { +namespace utils { + +namespace core { +hsa_status_t registerModuleFromMemory( + std::map &KernelInfoTable, + std::map &SymbolInfoTable, void *module_bytes, + size_t module_size, hsa_agent_t agent, + hsa_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state, 
std::vector &HSAExecutables); + +template +hsa_status_t moduleRegisterFromMemoryToPlace( + std::map &KernelInfoTable, + std::map &SymbolInfoTable, void *ModuleBytes, + size_t ModuleSize, hsa_agent_t agent, C Cb, + std::vector &HSAExecutables) { + auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { + C *Unwrapped = static_cast(CbState); + return (*Unwrapped)(Data, Size); + }; + return core::registerModuleFromMemory( + KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, agent, L, + static_cast(&Cb), HSAExecutables); +} +} // namespace core + +} // namespace utils +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm \ No newline at end of file diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesHSA.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesHSA.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesHSA.cpp @@ -0,0 +1,858 @@ +#include "UtilitiesHSA.h" + +#include "ELFSymbols.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" + +#include +#include + +#include "hsa_ext_amd.h" + +#include "msgpack.h" + +#ifndef TARGET_NAME +#define TARGET_NAME AMDGPU +#endif +#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" +#include "Debug.h" + +using namespace llvm; +using namespace llvm::object; +using namespace llvm::ELF; + +inline const char *get_error_string(hsa_status_t err) { + const char *res; + hsa_status_t rc = hsa_status_string(err, &res); + return (rc == HSA_STATUS_SUCCESS) ? 
res : "HSA_STATUS UNKNOWN."; +} + +namespace hsa { +// Wrap HSA iterate API in a shim that allows passing general callables +template +hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) { + auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol, + void *data) -> hsa_status_t { + C *unwrapped = static_cast(data); + return (*unwrapped)(executable, symbol); + }; + return hsa_executable_iterate_symbols(executable, L, + static_cast(&cb)); +} +} // namespace hsa + +typedef unsigned char *address; +/* + * Note descriptors. + */ +// FreeBSD already declares Elf_Note (indirectly via ) +#if !defined(__FreeBSD__) +typedef struct { + uint32_t n_namesz; /* Length of note's name. */ + uint32_t n_descsz; /* Length of note's value. */ + uint32_t n_type; /* Type of note. */ + // then name + // then padding, optional + // then desc, at 4 byte alignment (not 8, despite being elf64) +} Elf_Note; +#endif + +class KernelArgMD { +public: + enum class ValueKind { + HiddenGlobalOffsetX, + HiddenGlobalOffsetY, + HiddenGlobalOffsetZ, + HiddenNone, + HiddenPrintfBuffer, + HiddenDefaultQueue, + HiddenCompletionAction, + HiddenMultiGridSyncArg, + HiddenHostcallBuffer, + HiddenHeapV1, + HiddenBlockCountX, + HiddenBlockCountY, + HiddenBlockCountZ, + HiddenGroupSizeX, + HiddenGroupSizeY, + HiddenGroupSizeZ, + HiddenRemainderX, + HiddenRemainderY, + HiddenRemainderZ, + HiddenGridDims, + HiddenQueuePtr, + Unknown + }; + + KernelArgMD() + : name_(std::string()), size_(0), offset_(0), + valueKind_(ValueKind::Unknown) {} + + // fields + std::string name_; + uint32_t size_; + uint32_t offset_; + ValueKind valueKind_; +}; + +static const std::map ArgValueKind = { + // v3 + // {"by_value", KernelArgMD::ValueKind::ByValue}, + // {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer}, + // {"dynamic_shared_pointer", + // KernelArgMD::ValueKind::DynamicSharedPointer}, + // {"sampler", KernelArgMD::ValueKind::Sampler}, + // {"image", KernelArgMD::ValueKind::Image}, + 
// {"pipe", KernelArgMD::ValueKind::Pipe}, + // {"queue", KernelArgMD::ValueKind::Queue}, + {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX}, + {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY}, + {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ}, + {"hidden_none", KernelArgMD::ValueKind::HiddenNone}, + {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer}, + {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue}, + {"hidden_completion_action", + KernelArgMD::ValueKind::HiddenCompletionAction}, + {"hidden_multigrid_sync_arg", + KernelArgMD::ValueKind::HiddenMultiGridSyncArg}, + {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer}, + {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}, + {"hidden_block_count_x", KernelArgMD::ValueKind::HiddenBlockCountX}, + {"hidden_block_count_y", KernelArgMD::ValueKind::HiddenBlockCountY}, + {"hidden_block_count_z", KernelArgMD::ValueKind::HiddenBlockCountZ}, + {"hidden_group_size_x", KernelArgMD::ValueKind::HiddenGroupSizeX}, + {"hidden_group_size_y", KernelArgMD::ValueKind::HiddenGroupSizeY}, + {"hidden_group_size_z", KernelArgMD::ValueKind::HiddenGroupSizeZ}, + {"hidden_remainder_x", KernelArgMD::ValueKind::HiddenRemainderX}, + {"hidden_remainder_y", KernelArgMD::ValueKind::HiddenRemainderY}, + {"hidden_remainder_z", KernelArgMD::ValueKind::HiddenRemainderZ}, + {"hidden_grid_dims", KernelArgMD::ValueKind::HiddenGridDims}, + {"hidden_queue_ptr", KernelArgMD::ValueKind::HiddenQueuePtr}, +}; + +namespace llvm::omp::target::plugin::utils::core { + +// namespace core { + +hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) { + if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) { + hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault; + // memory_fault.agent + // memory_fault.virtual_address + // memory_fault.fault_reason_mask + // fprintf("[GPU Error at %p: Reason is ", 
memory_fault.virtual_address); + std::stringstream stream; + stream << std::hex << (uintptr_t)memory_fault.virtual_address; + std::string addr("0x" + stream.str()); + + std::string err_string = "[GPU Memory Error] Addr: " + addr; + err_string += " Reason: "; + if (!(memory_fault.fault_reason_mask & 0x00111111)) { + err_string += "No Idea! "; + } else { + if (memory_fault.fault_reason_mask & 0x00000001) + err_string += "Page not present or supervisor privilege. "; + if (memory_fault.fault_reason_mask & 0x00000010) + err_string += "Write access to a read-only page. "; + if (memory_fault.fault_reason_mask & 0x00000100) + err_string += "Execute access to a page marked NX. "; + if (memory_fault.fault_reason_mask & 0x00001000) + err_string += "Host access only. "; + if (memory_fault.fault_reason_mask & 0x00010000) + err_string += "ECC failure (if supported by HW). "; + if (memory_fault.fault_reason_mask & 0x00100000) + err_string += "Can't determine the exact fault address. "; + } + fprintf(stderr, "%s\n", err_string.c_str()); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t atl_init_gpu_context() { + hsa_status_t err = hsa_amd_register_system_event_handler(callbackEvent, NULL); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Registering the system for memory faults", get_error_string(err)); + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +static bool isImplicit(KernelArgMD::ValueKind value_kind) { + switch (value_kind) { + case KernelArgMD::ValueKind::HiddenGlobalOffsetX: + case KernelArgMD::ValueKind::HiddenGlobalOffsetY: + case KernelArgMD::ValueKind::HiddenGlobalOffsetZ: + case KernelArgMD::ValueKind::HiddenNone: + case KernelArgMD::ValueKind::HiddenPrintfBuffer: + case KernelArgMD::ValueKind::HiddenDefaultQueue: + case KernelArgMD::ValueKind::HiddenCompletionAction: + case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: + case KernelArgMD::ValueKind::HiddenHostcallBuffer: 
+  case KernelArgMD::ValueKind::HiddenHeapV1:
+  case KernelArgMD::ValueKind::HiddenBlockCountX:
+  case KernelArgMD::ValueKind::HiddenBlockCountY:
+  case KernelArgMD::ValueKind::HiddenBlockCountZ:
+  case KernelArgMD::ValueKind::HiddenGroupSizeX:
+  case KernelArgMD::ValueKind::HiddenGroupSizeY:
+  case KernelArgMD::ValueKind::HiddenGroupSizeZ:
+  case KernelArgMD::ValueKind::HiddenRemainderX:
+  case KernelArgMD::ValueKind::HiddenRemainderY:
+  case KernelArgMD::ValueKind::HiddenRemainderZ:
+  case KernelArgMD::ValueKind::HiddenGridDims:
+  case KernelArgMD::ValueKind::HiddenQueuePtr:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Locate the metadata note of an ELF image. Walks the PT_NOTE program headers
+// and returns the [begin, end) byte range of the descriptor of the first note
+// that is not rejected below. Returns {nullptr, nullptr} when the program
+// headers or notes cannot be read or when no note is found.
+// NOTE(review): template arguments (std::pair, ELFObjectFile, ArrayRef, the
+// casts further down) appear to have been lost in this copy of the patch.
+static std::pair
+findMetadata(const ELFObjectFile &ELFObj) {
+  constexpr std::pair Failure = {
+      nullptr, nullptr};
+  const auto &Elf = ELFObj.getELFFile();
+  auto PhdrsOrErr = Elf.program_headers();
+  if (!PhdrsOrErr) {
+    consumeError(PhdrsOrErr.takeError());
+    return Failure;
+  }
+
+  for (auto Phdr : *PhdrsOrErr) {
+    if (Phdr.p_type != PT_NOTE)
+      continue;
+
+    Error Err = Error::success();
+    for (auto Note : Elf.notes(Phdr, Err)) {
+      // Note types 7 and 8 are rejected outright.
+      if (Note.getType() == 7 || Note.getType() == 8)
+        return Failure;
+
+      // Code object v2 uses yaml metadata and is no longer supported.
+      if (Note.getType() == NT_AMD_HSA_METADATA && Note.getName() == "AMD")
+        return Failure;
+      // Code object v3 should have AMDGPU metadata.
+      if (Note.getType() == NT_AMDGPU_METADATA && Note.getName() != "AMDGPU")
+        return Failure;
+
+      // The first note that survives the checks above is returned as-is.
+      ArrayRef Desc = Note.getDesc();
+      return {Desc.data(), Desc.data() + Desc.size()};
+    }
+
+    if (Err) {
+      consumeError(std::move(Err));
+      return Failure;
+    }
+  }
+
+  return Failure;
+}
+
+// Wrap the raw image bytes in an ELF object file and delegate to
+// findMetadata. Returns {nullptr, nullptr} if the buffer cannot be parsed as
+// ELF or the dyn_cast to the expected ELF object type fails.
+static std::pair
+find_metadata(void *binary, size_t binSize) {
+  constexpr std::pair Failure = {
+      nullptr, nullptr};
+
+  StringRef Buffer = StringRef(static_cast(binary), binSize);
+  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+                                                  /*InitContent=*/false);
+  if (!ElfOrErr) {
+    consumeError(ElfOrErr.takeError());
+    return Failure;
+  }
+
+  if (const auto *ELFObj = dyn_cast(ElfOrErr->get()))
+    return findMetadata(*ELFObj);
+  return Failure;
+}
+
+namespace {
+// Look up key `needle` in the msgpack map `message`. On a match, *res is set
+// to the matched value and, if that value is an array, *size receives its
+// element count. Returns 0 only if exactly one matching array was seen,
+// non-zero otherwise.
+int map_lookup_array(msgpack::byte_range message, const char *needle,
+                     msgpack::byte_range *res, uint64_t *size) {
+  unsigned count = 0;
+  // Functor whose array handler counts the hit and records the array length.
+  struct s : msgpack::functors_defaults {
+    s(unsigned &count, uint64_t *size) : count(count), size(size) {}
+    unsigned &count;
+    uint64_t *size;
+    const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
+      count++;
+      *size = N;
+      return bytes.end;
+    }
+  };
+
+  msgpack::foreach_map(message,
+                       [&](msgpack::byte_range key, msgpack::byte_range value) {
+                         if (msgpack::message_is_string(key, needle)) {
+                           // If the message is an array, record number of
+                           // elements in *size
+                           msgpack::handle_msgpack(value, {count, size});
+                           // return the whole array
+                           *res = value;
+                         }
+                       });
+  // Only claim success if exactly one key/array pair matched
+  return count != 1;
+}
+
+// Look up key `needle` in the msgpack map `message` and copy its string value
+// into *res. Returns 0 only if exactly one matching string value was seen.
+int map_lookup_string(msgpack::byte_range message, const char *needle,
+                      std::string *res) {
+  unsigned count = 0;
+  // Functor whose string handler counts the hit and copies the bytes out.
+  struct s : public msgpack::functors_defaults {
+    s(unsigned &count, std::string *res) : count(count), res(res) {}
+    unsigned &count;
+    std::string *res;
+    void handle_string(size_t N, const unsigned char *str) {
+      count++;
+      *res = std::string(str, str + N);
+    }
+  };
+  msgpack::foreach_map(message,
+                       [&](msgpack::byte_range key, msgpack::byte_range value) {
+                         if (msgpack::message_is_string(key, needle)) {
+                           msgpack::handle_msgpack(value, {count, res});
+                         }
+                       });
+  return count != 1;
+}
+
+// Look up key `needle` in the msgpack map `message` and store its unsigned
+// value in *res. Returns 0 only if exactly one matching unsigned value was
+// seen; *res is left untouched when nothing matches.
+int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
+                        uint64_t *res) {
+  unsigned count = 0;
+  msgpack::foreach_map(message,
+                       [&](msgpack::byte_range key, msgpack::byte_range value) {
+                         if (msgpack::message_is_string(key, needle)) {
+                           msgpack::foronly_unsigned(value, [&](uint64_t x) {
+                             count++;
+                             *res = x;
+                           });
+                         }
+                       });
+  return count != 1;
+}
+
+// Store element number `elt` of the msgpack array `message` in *res.
+// Returns 0 if that index was visited, 1 otherwise. The whole array is
+// always traversed.
+int array_lookup_element(msgpack::byte_range message, uint64_t elt,
+                         msgpack::byte_range *res) {
+  int rc = 1;
+  uint64_t i = 0;
+  msgpack::foreach_array(message, [&](msgpack::byte_range value) {
+    if (i == elt) {
+      *res = value;
+      rc = 0;
+    }
+    i++;
+  });
+  return rc;
+}
+
+// Fill `kernelarg` from one entry of a kernel's ".args" array, reading the
+// ".name", ".size", ".offset" and ".value_kind" keys. Value kinds that are
+// not in ArgValueKind leave valueKind_ unchanged. `error` is never modified,
+// so this currently always returns 0.
+int populateKernelArgMD(msgpack::byte_range args_element,
+                        KernelArgMD *kernelarg) {
+  using namespace msgpack;
+  int error = 0;
+  foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
+    if (message_is_string(key, ".name")) {
+      foronly_string(value, [&](size_t N, const unsigned char *str) {
+        kernelarg->name_ = std::string(str, str + N);
+      });
+    } else if (message_is_string(key, ".size")) {
+      foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
+    } else if (message_is_string(key, ".offset")) {
+      foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
+    } else if (message_is_string(key, ".value_kind")) {
+      foronly_string(value, [&](size_t N, const unsigned char *str) {
+        std::string s = std::string(str, str + N);
+        auto itValueKind = ArgValueKind.find(s);
+        if (itValueKind != ArgValueKind.end()) {
+          kernelarg->valueKind_ = itValueKind->second;
+        }
+      });
+    }
+  });
+  return error;
+}
+} // namespace
+
+static hsa_status_t get_code_object_custom_metadata(
+    void *binary, size_t binSize,
+    std::map &KernelInfoTable) {
+  // parse code object with different keys 
from v2
+  // also, the kernel name is not the same as the symbol name -- so a
+  // symbol->name map is needed
+
+  // Byte range of the metadata note; first is null if none was found.
+  std::pair metadata =
+      find_metadata(binary, binSize);
+  if (!metadata.first) {
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  uint64_t kernelsSize = 0;
+  // Running count of failed msgpack lookups; any non-zero value aborts.
+  int msgpack_errors = 0;
+  msgpack::byte_range kernel_array;
+  msgpack_errors =
+      map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
+                       &kernel_array, &kernelsSize);
+
+  if (msgpack_errors != 0) {
+    printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+           "kernels lookup in program metadata");
+    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+  }
+
+  // One iteration per entry of the "amdhsa.kernels" array.
+  for (size_t i = 0; i < kernelsSize; i++) {
+    assert(msgpack_errors == 0);
+    std::string kernelName;
+    std::string symbolName;
+    std::string kernelKind;
+
+    msgpack::byte_range element;
+    msgpack_errors += array_lookup_element(kernel_array, i, &element);
+    if (msgpack_errors != 0) {
+      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+             "element lookup in kernel metadata");
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+
+    msgpack_errors += map_lookup_string(element, ".name", &kernelName);
+    msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
+    if (msgpack_errors != 0) {
+      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+             "strings lookup in kernel metadata");
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+
+    // Make sure that kernelName + ".kd" == symbolName
+    if ((kernelName + ".kd") != symbolName) {
+      printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
+             __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+
+    // All numeric fields start at zero; the kind defaults to "normal".
+    KernelMetaDataTy info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "normal"};
+
+    // Left uninitialized; each is only read after its lookup succeeded.
+    uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
+
+    // The ".kind" key is only read for the device init/fini kernels.
+    if (symbolName == "amdgcn.device.init.kd" ||
+        symbolName == "amdgcn.device.fini.kd") {
+      msgpack_errors += map_lookup_string(element, ".kind", &kernelKind);
+      if (msgpack_errors != 0) {
+        printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+               "kind metadata lookup in kernel metadata");
+        return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+      }
+      info.kind = kernelKind;
+    }
+
+    msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
+    if (msgpack_errors != 0) {
+      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+             "sgpr count metadata lookup in kernel metadata");
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+
+    // The 64-bit lookup results are narrowed into the 32-bit info fields.
+    info.sgpr_count = sgpr_count;
+
+    msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
+    if (msgpack_errors != 0) {
+      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+             "vgpr count metadata lookup in kernel metadata");
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+
+    info.vgpr_count = vgpr_count;
+
+    msgpack_errors +=
+        map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
+    if (msgpack_errors != 0) {
+      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+             "sgpr spill count metadata lookup in kernel metadata");
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+
+    info.sgpr_spill_count = sgpr_spill_count;
+
+    msgpack_errors +=
+        map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
+    if (msgpack_errors != 0) {
+      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+             "vgpr spill count metadata lookup in kernel metadata");
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+
+    info.vgpr_spill_count = vgpr_spill_count;
+
+    // Bytes covered by explicit arguments plus the padding before each arg.
+    size_t kernel_explicit_args_size = 0;
+    uint64_t kernel_segment_size;
+    msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
+                                          &kernel_segment_size);
+    if (msgpack_errors != 0) {
+      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+             "kernarg segment size metadata lookup in kernel metadata");
+      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+    }
+
+    bool hasHiddenArgs = false;
+    // The ".args" array is only required when the kernarg segment is non-empty.
+    if (kernel_segment_size > 0) {
+      uint64_t argsSize;
+      // End offset of the previously visited argument.
+      size_t offset = 0;
+
+      msgpack::byte_range args_array;
+      msgpack_errors +=
+          map_lookup_array(element, ".args", &args_array, &argsSize);
+      if (msgpack_errors != 0) {
+        printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+               "kernel args metadata lookup in kernel metadata");
+        return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+      }
+
+      for (size_t i = 0; i < argsSize; ++i) {
+        KernelArgMD lcArg;
+
+        msgpack::byte_range args_element;
+        msgpack_errors += array_lookup_element(args_array, i, &args_element);
+        if (msgpack_errors != 0) {
+          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+                 "iterate args map in kernel args metadata");
+          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+        }
+
+        msgpack_errors += populateKernelArgMD(args_element, &lcArg);
+        if (msgpack_errors != 0) {
+          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
+                 "iterate args map in kernel args metadata");
+          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+        }
+        // v3 has offset field and not align field
+        // padding is the gap between the previous argument's end and this
+        // argument's offset (unsigned arithmetic, no ordering check).
+        size_t new_offset = lcArg.offset_;
+        size_t padding = new_offset - offset;
+        offset = new_offset;
+
+        offset += lcArg.size_;
+
+        // check if the arg is a hidden/implicit arg
+        // this logic assumes that all hidden args are 8-byte aligned
+        if (!isImplicit(lcArg.valueKind_)) {
+          info.explicit_argument_count++;
+          kernel_explicit_args_size += lcArg.size_;
+          DP("Explicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
+             lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
+        } else {
+          info.implicit_argument_count++;
+          hasHiddenArgs = true;
+          DP("Implicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
+             lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
+        }
+        // Padding is added for every argument, explicit or implicit.
+        kernel_explicit_args_size += padding;
+      }
+    }
+
+    // TODO: Probably don't want this arithmetic
+    info.kernel_segment_size =
+        (!hasHiddenArgs ? 
kernel_explicit_args_size : kernel_segment_size); + DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), + kernel_segment_size, info.kernel_segment_size); + + // kernel received, now add it to the kernel info table + KernelInfoTable[kernelName] = info; + } + + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t +populateInfoTables(hsa_executable_symbol_t symbol, + std::map &KernelInfoTable, + std::map &SymbolInfoTable) { + hsa_symbol_kind_t type; + + uint32_t name_length; + hsa_status_t err; + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, + &type); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Symbol info extraction", get_error_string(err)); + return err; + } + DP("Exec Symbol type: %d\n", type); + if (type == HSA_SYMBOL_KIND_KERNEL) { + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Symbol info extraction", get_error_string(err)); + return err; + } + char *name = reinterpret_cast(malloc(name_length + 1)); + err = hsa_executable_symbol_get_info(symbol, + HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Symbol info extraction", get_error_string(err)); + return err; + } + // remove the suffix .kd from symbol name. 
+ name[name_length - 3] = 0; + + KernelMetaDataTy info; + std::string kernelName(name); + // by now, the kernel info table should already have an entry + // because the non-ROCr custom code object parsing is called before + // iterating over the code object symbols using ROCr + if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) { + DP("amdgpu internal consistency error\n"); + return HSA_STATUS_ERROR; + } + // found, so assign and update + info = KernelInfoTable[kernelName]; + + /* Extract dispatch information from the symbol */ + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &(info.kernel_object)); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Extracting the symbol from the executable", + get_error_string(err)); + return err; + } + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, + &(info.group_segment_size)); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Extracting the group segment size from the executable", + get_error_string(err)); + return err; + } + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &(info.private_segment_size)); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Extracting the private segment from the executable", + get_error_string(err)); + return err; + } + + DP("Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " + "kernarg\n", + kernelName.c_str(), info.kernel_object, info.group_segment_size, + info.private_segment_size, info.kernel_segment_size); + + // assign it back to the kernel info table + KernelInfoTable[kernelName] = info; + free(name); + } else if (type == HSA_SYMBOL_KIND_VARIABLE) { + err = hsa_executable_symbol_get_info( + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); + if (err != 
HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Symbol info extraction", get_error_string(err));
+      return err;
+    }
+    // NOTE(review): `name` is only released by the free(name) on the success
+    // path below; the early `return err` statements in between leave it
+    // allocated, and the malloc result is not checked.
+    char *name = reinterpret_cast(malloc(name_length + 1));
+    err = hsa_executable_symbol_get_info(symbol,
+                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Symbol info extraction", get_error_string(err));
+      return err;
+    }
+    // Terminate the name explicitly at the queried length.
+    name[name_length] = 0;
+
+    SymbolInfoTy info;
+
+    err = hsa_executable_symbol_get_info(
+        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Symbol info address extraction", get_error_string(err));
+      return err;
+    }
+
+    err = hsa_executable_symbol_get_info(
+        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Symbol info size extraction", get_error_string(err));
+      return err;
+    }
+
+    DP("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size);
+    SymbolInfoTable[std::string(name)] = info;
+    free(name);
+  } else {
+    DP("Symbol is an indirect function\n");
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+//} // namespace core
+
+/// Build an HSA executable for \p agent from the code object held in
+/// [module_bytes, module_bytes + module_size) and fill the info tables.
+///
+/// Steps, in order:
+///  1. query the agent profile (the result is then overridden with
+///     HSA_PROFILE_FULL, see the FIXME below);
+///  2. create an unfrozen executable and a code object reader over the
+///     in-memory image;
+///  3. parse the custom code object metadata into \p KernelInfoTable;
+///  4. deserialize the code object and hand the resulting handle, together
+///     with \p module_size and \p cb_state, to \p on_deserialized_data;
+///  5. load the code object through the reader, freeze the executable and
+///     visit every symbol with populateInfoTables.
+///
+/// On success the executable is appended to \p HSAExecutables and
+/// HSA_STATUS_SUCCESS is returned. A failure creating the reader or a failing
+/// callback returns that status unchanged; every other failure is reported as
+/// HSA_STATUS_ERROR.
+///
+/// NOTE(review): neither `executable` nor `code_object_rdr` is destroyed on
+/// the failure paths of this function, and the reader is not destroyed on
+/// success either - confirm who owns them.
+hsa_status_t registerModuleFromMemory(
+    std::map &KernelInfoTable,
+    std::map &SymbolInfoTable, void *module_bytes,
+    size_t module_size, hsa_agent_t agent,
+    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
+                                         void *cb_state),
+    void *cb_state, std::vector &HSAExecutables) {
+  hsa_status_t err;
+  hsa_executable_t executable = {0};
+  hsa_code_object_reader_t code_object_rdr;
+
+  hsa_profile_t agent_profile;
+
+  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
+  if (err != HSA_STATUS_SUCCESS) {
+    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+           "Query the agent profile", get_error_string(err));
+    return HSA_STATUS_ERROR;
+  }
+  // FIXME: Assume that every profile is FULL until we understand how to build
+  // GCN with base profile
+  agent_profile = HSA_PROFILE_FULL;
+  /* Create the empty executable. */
+  err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
+                              &executable);
+
+  if (err != HSA_STATUS_SUCCESS) {
+    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+           "Create the executable", get_error_string(err));
+    return HSA_STATUS_ERROR;
+  }
+
+  err = hsa_code_object_reader_create_from_memory(module_bytes, module_size,
+                                                  &code_object_rdr);
+  if (err != HSA_STATUS_SUCCESS) {
+    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+           "Error in creating code object reader from memory!",
+           get_error_string(err));
+    return err;
+  }
+
+  // The do/while(0) runs exactly once; `continue` inside it jumps to the loop
+  // condition and therefore leaves the block with module_load_success still
+  // false.
+  bool module_load_success = false;
+  do // Existing control flow used continue, preserve that for this patch
+  {
+    {
+      // Some metadata info is not available through ROCr API, so use custom
+      // code object metadata parsing to collect such metadata info
+
+      err = get_code_object_custom_metadata(module_bytes, module_size,
+                                            KernelInfoTable);
+      if (err != HSA_STATUS_SUCCESS) {
+        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+           "Getting custom code object metadata", get_error_string(err));
+        continue;
+      }
+      // Deserialize code object.
+      hsa_code_object_t code_object = {0};
+      err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
+                                        &code_object);
+      if (err != HSA_STATUS_SUCCESS) {
+        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+           "Code Object Deserialization", get_error_string(err));
+        continue;
+      }
+      assert(0 != code_object.handle);
+
+      // Mutating the device image here avoids another allocation & memcpy
+      void *code_object_alloc_data =
+          reinterpret_cast(code_object.handle);
+      hsa_status_t impl_err =
+          on_deserialized_data(code_object_alloc_data, module_size, cb_state);
+      if (impl_err != HSA_STATUS_SUCCESS) {
+        printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+               "Error in deserialized_data callback",
+               get_error_string(impl_err));
+        return impl_err;
+      }
+      /* Load the code object. */
+      err = hsa_executable_load_agent_code_object(executable, agent,
+                                                  code_object_rdr, NULL, NULL);
+      if (err != HSA_STATUS_SUCCESS) {
+        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+           "Loading the code object", get_error_string(err));
+        continue;
+      }
+
+      // cannot iterate over symbols until executable is frozen
+    }
+    module_load_success = true;
+  } while (0);
+  DP("Modules loaded successful? %d\n", module_load_success);
+  if (module_load_success) {
+    /* Freeze the executable; it can now be queried for symbols. */
+    err = hsa_executable_freeze(executable, "");
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Freeze the executable", get_error_string(err));
+      return HSA_STATUS_ERROR;
+    }
+
+    err = hsa::executable_iterate_symbols(
+        executable,
+        [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t {
+          return populateInfoTables(symbol, KernelInfoTable, SymbolInfoTable);
+        });
+    if (err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Iterating over symbols for execuatable", get_error_string(err));
+      return HSA_STATUS_ERROR;
+    }
+
+    // save the executable and destroy during finalize
+    HSAExecutables.push_back(executable);
+    return HSA_STATUS_SUCCESS;
+  } else {
+    return HSA_STATUS_ERROR;
+  }
+}
+} // namespace llvm::omp::target::plugin::utils::core
+
+/// Look up \p SymName in \p ELFObj without loading the image and describe it
+/// in \p Res: Addr is the symbol's st_value offset from the ELF file base,
+/// Size is st_size and ShType is the sh_type of the section named by
+/// st_shndx.
+///
+/// \returns 0 on success; 1 when the lookup fails, yields a null symbol, or
+/// the symbol's section cannot be retrieved.
+int getSymbolInfoWithoutLoading(const ELFObjectFile &ELFObj,
+                                StringRef SymName, SymbolInfo *Res) {
+  auto SymOrErr = getELFSymbol(ELFObj, SymName);
+  if (!SymOrErr) {
+    std::string ErrorString = toString(SymOrErr.takeError());
+    DP("Failed ELF lookup: %s\n", ErrorString.c_str());
+    return 1;
+  }
+  if (!*SymOrErr)
+    return 1;
+
+  auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx);
+  if (!SymSecOrErr) {
+    // Take the error from the section lookup itself. SymOrErr holds a value
+    // at this point, so reading its error would log an empty message and
+    // leave the failure stored in SymSecOrErr unhandled.
+    std::string ErrorString = toString(SymSecOrErr.takeError());
+    DP("Failed ELF lookup: %s\n", ErrorString.c_str());
+    return 1;
+  }
+
+  Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base();
+  Res->Size = static_cast((*SymOrErr)->st_size);
+  Res->ShType = static_cast((*SymSecOrErr)->sh_type);
+  return 0;
+}
+
+/// Convenience overload: view [Base, Base + ImgSize) as an ELF object file
+/// (created with InitContent=false) and forward to the overload above.
+///
+/// \returns 1 when the buffer cannot be opened as an ELF object or the
+/// dyn_cast to the expected ELF object type fails; otherwise the result of
+/// the forwarded lookup.
+int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName,
+                                SymbolInfo *Res) {
+  StringRef Buffer = StringRef(Base, ImgSize);
+  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+                                                  /*InitContent=*/false);
+  if (!ElfOrErr) {
+    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
+    return 1;
+  }
+
+  if (const auto *ELFObj = dyn_cast(ElfOrErr->get()))
+    return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res);
+  return 1;
+}
+
+/// \returns true iff \p Sym can be found in the image [Data, Data + Size) and
+/// the address reported for it is non-null.
+bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
+  SymbolInfo SI;
+  int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
+  return (Rc == 0) && (SI.Addr != nullptr);
+}
\ No newline at end of file
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.h
@@ -0,0 +1,282 @@
+//===--- amdgpu/impl/msgpack.h ------------------------------------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MSGPACK_H
+#define MSGPACK_H
+
+#include
+
+namespace msgpack {
+
+// The message pack format is dynamically typed, schema-less. Format is:
+// message: [type][header][payload]
+// where type is one byte, header length is a fixed length function of type
+// payload is zero to N bytes, with the length encoded in [type][header]
+
+// Scalar fields include boolean, signed integer, float, string etc
+// Composite types are sequences of messages
+// Array field is [header][element][element]...
+// Map field is [header][key][value][key][value]...
+
+// Multibyte integer fields are big endian encoded
+// The map key can be any message type
+// Maps may contain duplicate keys
+// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or
+// in as many as nine, as signed or unsigned. Implementation defined.
+// Similarly "foo" may embed the length in the type field or in multiple bytes
+
+// This parser is structured as an iterator over a sequence of bytes.
+// It calls a user provided function on each message in order to extract fields
+// The default implementation for each scalar type is to do nothing. For map or
+// arrays, the default implementation returns just after that message to support
+// iterating to the next message, but otherwise has no effect.
+
+// Half-open span [start, end) of encoded bytes.
+struct byte_range {
+  const unsigned char *start;
+  const unsigned char *end;
+};
+
+// Returns a pointer just past the message that begins at start, or null when
+// the message does not fit in [start, end).
+const unsigned char *skip_next_message(const unsigned char *start,
+                                       const unsigned char *end);
+
+// Base class for message handlers. The parser calls the public cb_* entry
+// points, which forward to handle_* on the derived object; a derived class
+// overrides behaviour by declaring a handle_* member of the same name, and
+// otherwise the private defaults below are used.
+template class functors_defaults {
+public:
+  void cb_string(size_t N, const unsigned char *str) {
+    derived().handle_string(N, str);
+  }
+  void cb_boolean(bool x) { derived().handle_boolean(x); }
+  void cb_signed(int64_t x) { derived().handle_signed(x); }
+  void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); }
+  void cb_array_elements(byte_range bytes) {
+    derived().handle_array_elements(bytes);
+  }
+  void cb_map_elements(byte_range key, byte_range value) {
+    derived().handle_map_elements(key, value);
+  }
+  const unsigned char *cb_array(uint64_t N, byte_range bytes) {
+    return derived().handle_array(N, bytes);
+  }
+  const unsigned char *cb_map(uint64_t N, byte_range bytes) {
+    return derived().handle_map(N, bytes);
+  }
+
+private:
+  Derived &derived() { return *static_cast(this); }
+
+  // Default implementations for scalar ops are no-ops
+  void handle_string(size_t, const unsigned char *) {}
+  void handle_boolean(bool) {}
+  void handle_signed(int64_t) {}
+  void handle_unsigned(uint64_t) {}
+  void handle_array_elements(byte_range) {}
+  void handle_map_elements(byte_range, byte_range) {}
+
+  // Default implementation for sequences is to skip over the messages
+  // Walks N elements and returns the position after the last one, or nullptr
+  // as soon as an element cannot be skipped.
+  const unsigned char *handle_array(uint64_t N, byte_range bytes) {
+    for (uint64_t i = 0; i < N; i++) {
+      const unsigned char *next = skip_next_message(bytes.start, bytes.end);
+      if (!next) {
+        return nullptr;
+      }
+      // Note: the element callback receives the range from the element's
+      // first byte to the end of the enclosing buffer, not {start, next}.
+      cb_array_elements(bytes);
+      bytes.start = next;
+    }
+    return bytes.start;
+  }
+  // Walks N key/value pairs; each callback receives the exact byte ranges of
+  // the key and of the value. Returns nullptr if either cannot be skipped.
+  const unsigned char *handle_map(uint64_t N, byte_range bytes) {
+    for (uint64_t i = 0; i < N; i++) {
+      const unsigned char *start_key = bytes.start;
+      const unsigned char *end_key = skip_next_message(start_key, bytes.end);
+      if (!end_key) {
+        return nullptr;
+      }
+      const unsigned char *start_value = end_key;
+      const unsigned char *end_value =
+          skip_next_message(start_value, bytes.end);
+      if (!end_value) {
+        return nullptr;
+      }
+      cb_map_elements({start_key, end_key}, {start_value, end_value});
+      bytes.start = end_value;
+    }
+    return bytes.start;
+  }
+};
+
+// One enumerator per row of msgpack.def, in file order.
+typedef enum : uint8_t {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME,
+#include "msgpack.def"
+#undef X
+} type;
+
+[[noreturn]] void internal_error();
+// Maps the first byte of a message to its type.
+type parse_type(unsigned char x);
+// Number of bytes taken by the type byte plus fixed header for ty (the WIDTH
+// column of msgpack.def).
+unsigned bytes_used_fixed(type ty);
+
+// Reader that extracts the header-encoded quantity (value, length or element
+// count, depending on the type) given a pointer to the start of the message.
+typedef uint64_t (*payload_info_t)(const unsigned char *);
+payload_info_t payload_info(msgpack::type ty);
+
+template R bitcast(T x);
+
+// Handle the single message at the front of bytes, whose type is already
+// known to be ty, by invoking the matching cb_* on f.
+// Returns a pointer just past the message, or 0 when the buffer is too short
+// for the header or the payload. The caller must pass a non-empty range.
+template
+const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) {
+  const unsigned char *start = bytes.start;
+  const unsigned char *end = bytes.end;
+  const uint64_t available = end - start;
+  assert(available != 0);
+  assert(ty == parse_type(*start));
+
+  const uint64_t bytes_used = bytes_used_fixed(ty);
+  if (available < bytes_used) {
+    return 0;
+  }
+  const uint64_t available_post_header = available - bytes_used;
+
+  const payload_info_t info = payload_info(ty);
+  const uint64_t N = info(start);
+
+  switch (ty) {
+  case msgpack::t:
+  case msgpack::f: {
+    // f is 0b11000010 (0xc2), t is 0b11000011 (0xc3), masked with 0x1
+    f.cb_boolean(N);
+    return start + bytes_used;
+  }
+
+  case msgpack::posfixint:
+  case msgpack::uint8:
+  case msgpack::uint16:
+  case msgpack::uint32:
+  case msgpack::uint64: {
+    f.cb_unsigned(N);
+    return start + bytes_used;
+  }
+
+  case msgpack::negfixint:
+  case msgpack::int8:
+  case msgpack::int16:
+  case msgpack::int32:
+  case msgpack::int64: {
+    f.cb_signed(bitcast(N));
+    return start + bytes_used;
+  }
+
+  case msgpack::fixstr:
+  case msgpack::str8:
+  case msgpack::str16:
+  case msgpack::str32: {
+    // N is the string length in bytes; the characters follow the header.
+    if (available_post_header < N) {
+      return 0;
+    } else {
+      f.cb_string(N, start + bytes_used);
+      return start + bytes_used + N;
+    }
+  }
+
+  case msgpack::fixarray:
+  case msgpack::array16:
+  case msgpack::array32: {
+    // N is the element count; the handler decides where the array ends.
+    return f.cb_array(N, {start + bytes_used, end});
+  }
+
+  case msgpack::fixmap:
+  case msgpack::map16:
+  case msgpack::map32: {
+    // N is the number of key/value pairs.
+    return f.cb_map(N, {start + bytes_used, end});
+  }
+
+  // The remaining types trigger no callback: step over the fixed header and
+  // the N payload bytes.
+  case msgpack::nil:
+  case msgpack::bin8:
+  case msgpack::bin16:
+  case msgpack::bin32:
+  case msgpack::float32:
+  case msgpack::float64:
+  case msgpack::ext8:
+  case msgpack::ext16:
+  case msgpack::ext32:
+  case msgpack::fixext1:
+  case msgpack::fixext2:
+  case msgpack::fixext4:
+  case msgpack::fixext8:
+  case msgpack::fixext16:
+  case msgpack::never_used: {
+    if (available_post_header < N) {
+      return 0;
+    }
+    return start + bytes_used + N;
+  }
+  }
+  internal_error();
+}
+
+// Handle the message at the front of bytes: read its type from the first byte
+// and dispatch to handle_msgpack_given_type. Returns 0 for an empty range.
+template
+const unsigned char *handle_msgpack(byte_range bytes, F f) {
+  const unsigned char *start = bytes.start;
+  const unsigned char *end = bytes.end;
+  const uint64_t available = end - start;
+  if (available == 0) {
+    return 0;
+  }
+  const type ty = parse_type(*start);
+
+  switch (ty) {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+  case msgpack::NAME: \
+    return handle_msgpack_given_type(bytes, f);
+#include "msgpack.def"
+#undef X
+  }
+
+  internal_error();
+}
+
+// True iff the message at the front of bytes is a string equal to str.
+bool message_is_string(byte_range bytes, const char *str);
+
+// Invoke callback(N, str) if the message at the front of bytes is a string;
+// any other message type is ignored.
+template void foronly_string(byte_range bytes, C callback) {
+  struct inner : functors_defaults {
+    inner(C &cb) : cb(cb) {}
+    C &cb;
+    void handle_string(size_t N, const unsigned char *str) { cb(N, str); }
+  };
+  handle_msgpack(bytes, {callback});
+}
+
+// Invoke callback(x) if the message at the front of bytes is an unsigned
+// integer; any other message type is ignored.
+template void foronly_unsigned(byte_range bytes, C callback) {
+  struct inner : functors_defaults {
+    inner(C &cb) : cb(cb) {}
+    C &cb;
+    void handle_unsigned(uint64_t x) { cb(x); }
+  };
+  handle_msgpack(bytes, {callback});
+}
+
+// If the message at the front of bytes is an array, invoke callback once per
+// element with the byte range starting at that element.
+template void foreach_array(byte_range bytes, C callback) {
+  struct inner : functors_defaults {
+    inner(C &cb) : cb(cb) {}
+    C &cb;
+    void handle_array_elements(byte_range element) { cb(element); }
+  };
+  handle_msgpack(bytes, {callback});
+}
+
+// If the message at the front of bytes is a map, invoke callback(key, value)
+// once per pair with the byte ranges of the key and the value.
+template void foreach_map(byte_range bytes, C callback) {
+  struct inner : functors_defaults {
+    inner(C &cb) : cb(cb) {}
+    C &cb;
+    void handle_map_elements(byte_range key, byte_range value) {
+      cb(key, value);
+    }
+  };
+  handle_msgpack(bytes, {callback});
+}
+
+// Crude approximation to json
+void dump(byte_range);
+
+} // namespace msgpack
+
+#endif
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.cpp
@@ -0,0 +1,271 @@
+//===--- amdgpu/impl/msgpack.cpp ---------------------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include
+#include
+#include
+#include
+#include
+
+#include "msgpack.h"
+
+namespace msgpack {
+
+// Reached only when a switch over msgpack::type falls through or a type byte
+// matches no row of msgpack.def: print a message and terminate the process.
+[[noreturn]] void internal_error() {
+  printf("internal error\n");
+  exit(1);
+}
+
+// Returns the enumerator name of ty as a string literal.
+const char *type_name(type ty) {
+  switch (ty) {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+  case NAME: \
+    return #NAME;
+#include "msgpack.def"
+#undef X
+  }
+  internal_error();
+}
+
+// Returns the WIDTH column of msgpack.def for ty.
+unsigned bytes_used_fixed(msgpack::type ty) {
+  using namespace msgpack;
+  switch (ty) {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+  case NAME: \
+    return WIDTH;
+#include "msgpack.def"
+#undef X
+  }
+  internal_error();
+}
+
+// Classifies the first byte of a message. The X macro expands to one
+// `if (LOWER <= x <= UPPER) return NAME; else` per row of msgpack.def, so the
+// braces after the include form the final else branch.
+msgpack::type parse_type(unsigned char x) {
+
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+  if (x >= LOWER && x <= UPPER) { \
+    return NAME; \
+  } else
+#include "msgpack.def"
+#undef X
+  { internal_error(); }
+}
+
+// Reinterprets the bytes of x as an R by copying them; both types must have
+// the same size.
+template R bitcast(T x) {
+  static_assert(sizeof(T) == sizeof(R), "");
+  R tmp;
+  memcpy(&tmp, &x, sizeof(T));
+  return tmp;
+}
+// Explicit instantiation for the uint64_t argument used by the header.
+template int64_t bitcast(uint64_t);
+} // namespace msgpack
+
+// Helper functions for reading additional payload from the header
+// Depending on the type, this can be a number of bytes, elements,
+// key-value pairs or an embedded integer.
+// Each takes a pointer to the start of the header and returns a uint64_t
+
+namespace {
+namespace payload {
+// For types whose header carries no value, length or count.
+uint64_t read_zero(const unsigned char *) { return 0; }
+
+// Read the first byte and zero/sign extend it
+uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; }
+uint64_t read_embedded_s8(const unsigned char *start) {
+  int64_t res = msgpack::bitcast(start[0]);
+  return msgpack::bitcast(res);
+}
+
+// Read a masked part of the first byte
+uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; }
+uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; }
+uint64_t read_via_mask_0x1f(const unsigned char *start) {
+  return *start & 0x1fu;
+}
+
+// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend
+// Big endian format.
+uint64_t read_size_field_u8(const unsigned char *from) {
+  from++;
+  return from[0];
+}
+
+// TODO: detect whether host is little endian or not, and whether the intrinsic
+// is available. And probably use the builtin to test the diy
+const bool use_bswap = false;
+
+uint64_t read_size_field_u16(const unsigned char *from) {
+  from++;
+  if (use_bswap) {
+    uint16_t b;
+    memcpy(&b, from, 2);
+    return __builtin_bswap16(b);
+  } else {
+    // At most 0xffff, so the arithmetic on the promoted int cannot overflow.
+    return (from[0] << 8u) | from[1];
+  }
+}
+uint64_t read_size_field_u32(const unsigned char *from) {
+  from++;
+  if (use_bswap) {
+    uint32_t b;
+    memcpy(&b, from, 4);
+    return __builtin_bswap32(b);
+  } else {
+    // Widen each byte before shifting. An unsigned char promotes to int, so
+    // `from[0] << 24` turns negative for bytes >= 0x80 and would sign-extend
+    // into bits 32..63 of the returned value.
+    return ((uint64_t)from[0] << 24u) | ((uint64_t)from[1] << 16u) |
+           ((uint64_t)from[2] << 8u) | ((uint64_t)from[3] << 0u);
+  }
+}
+uint64_t read_size_field_u64(const unsigned char *from) {
+  from++;
+  if (use_bswap) {
+    uint64_t b;
+    memcpy(&b, from, 8);
+    return __builtin_bswap64(b);
+  } else {
+    // All eight bytes are widened first; see read_size_field_u32. Without the
+    // cast on from[4], a byte >= 0x80 there would set the whole upper half.
+    return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) |
+           ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) |
+           ((uint64_t)from[4] << 24u) | ((uint64_t)from[5] << 16u) |
+           ((uint64_t)from[6] << 8u) | ((uint64_t)from[7] << 0u);
+  }
+}
+
+// Signed variants: read the unsigned field of the same width, then convert
+// through the signed type so the value is sign extended to 64 bits.
+uint64_t read_size_field_s8(const unsigned char *from) {
+  uint8_t u = read_size_field_u8(from);
+  int64_t res = msgpack::bitcast(u);
+  return msgpack::bitcast(res);
+}
+uint64_t read_size_field_s16(const unsigned char *from) {
+  uint16_t u = read_size_field_u16(from);
+  int64_t res = msgpack::bitcast(u);
+  return msgpack::bitcast(res);
+}
+uint64_t read_size_field_s32(const unsigned char *from) {
+  uint32_t u = read_size_field_u32(from);
+  int64_t res = msgpack::bitcast(u);
+  return msgpack::bitcast(res);
+}
+uint64_t read_size_field_s64(const unsigned char *from) {
+  uint64_t u = read_size_field_u64(from);
+  int64_t res = msgpack::bitcast(u);
+  return msgpack::bitcast(res);
+}
+} // namespace payload
+} // namespace
+
+namespace msgpack {
+
+// Returns the reader named in the PAYLOAD column of msgpack.def for ty.
+payload_info_t payload_info(msgpack::type ty) {
+  using namespace msgpack;
+  switch (ty) {
+#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) \
+  case NAME: \
+    return payload::PAYLOAD;
+#include "msgpack.def"
+#undef X
+  }
+  internal_error();
+}
+
+} // namespace msgpack
+
+// Parse the message at start with a handler that overrides nothing, so the
+// only effect is computing where the message ends (null if it does not fit).
+const unsigned char *msgpack::skip_next_message(const unsigned char *start,
+                                                const unsigned char *end) {
+  class f : public functors_defaults {};
+  return handle_msgpack({start, end}, f());
+}
+
+namespace msgpack {
+// True iff the message at the front of bytes is a string with the same length
+// and bytes as needle. Non-string messages never match.
+bool message_is_string(byte_range bytes, const char *needle) {
+  bool matched = false;
+  size_t needleN = strlen(needle);
+
+  foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) {
+    if (N == needleN) {
+      if (memcmp(needle, str, N) == 0) {
+        matched = true;
+      }
+    }
+  });
+  return matched;
+}
+
+// Print the message at the front of bytes to stdout in a JSON-like layout.
+// Strings, signed and unsigned integers are printed; arrays and maps recurse
+// with increasing indentation; other types produce no output.
+void dump(byte_range bytes) {
+  struct inner : functors_defaults {
+    inner(unsigned indent) : indent(indent) {}
+    const unsigned by = 2;
+    unsigned indent = 0;
+
+    void handle_string(size_t N, const unsigned char *bytes) {
+      // The encoded string is not NUL terminated; copy it so %s can be used.
+      char *tmp = (char *)malloc(N + 1);
+      memcpy(tmp, bytes, N);
+      tmp[N] = '\0';
+      printf("\"%s\"", tmp);
+      free(tmp);
+    }
+
+    // int64_t/uint64_t are not `long` on every platform; print through
+    // long long so the format specifier always matches the argument.
+    void handle_signed(int64_t x) {
+      printf("%lld", static_cast<long long>(x));
+    }
+    void handle_unsigned(uint64_t x) {
+      printf("%llu", static_cast<unsigned long long>(x));
+    }
+
+    const unsigned char *handle_array(uint64_t N, byte_range bytes) {
+      printf("\n%*s[\n", indent, "");
+      indent += by;
+
+      for (uint64_t i = 0; i < N; i++) {
+        indent += by;
+        printf("%*s", indent, "");
+        const unsigned char *next = handle_msgpack(bytes, {indent});
+        printf(",\n");
+        indent -= by;
+        bytes.start = next;
+        // A null result means the element was truncated; stop and let the
+        // null propagate to the caller through bytes.start.
+        if (!next) {
+          break;
+        }
+      }
+      indent -= by;
+      printf("%*s]", indent, "");
+
+      return bytes.start;
+    }
+
+    const unsigned char *handle_map(uint64_t N, byte_range bytes) {
+      printf("\n%*s{\n", indent, "");
+      indent += by;
+
+      // One iteration per key/value pair (i advances by two up to 2 * N).
+      for (uint64_t i = 0; i < 2 * N; i += 2) {
+        const unsigned char *start_key = bytes.start;
+        printf("%*s", indent, "");
+        const unsigned char *end_key =
+            handle_msgpack({start_key, bytes.end}, {indent});
+        if (!end_key) {
+          break;
+        }
+
+        printf(" : ");
+
+        const unsigned char *start_value = end_key;
+        const unsigned char *end_value =
+            handle_msgpack({start_value, bytes.end}, {indent});
+
+        if (!end_value) {
+          break;
+        }
+
+        printf(",\n");
+        bytes.start = end_value;
+      }
+
+      indent -= by;
+      printf("%*s}", indent, "");
+
+      return bytes.start;
+    }
+  };
+
+  handle_msgpack(bytes, {0});
+  printf("\n");
+}
+
+} // namespace msgpack
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.def b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.def
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/msgpack.def
@@ -0,0 +1,46 @@
+//===--- amdgpu/impl/msgpack.def ---------------------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// name, header width, reader, [lower, upper] encoding
+X(posfixint, 1, read_embedded_u8, 0x00, 0x7f)
+X(negfixint, 1, read_embedded_s8, 0xe0, 0xff)
+X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f)
+X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f)
+X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf)
+X(nil, 1, read_zero, 0xc0, 0xc0)
+X(never_used, 1, read_zero, 0xc1, 0xc1)
+X(f, 1, read_via_mask_0x1, 0xc2, 0xc2)
+X(t, 1, read_via_mask_0x1, 0xc3, 0xc3)
+X(bin8, 2, read_size_field_u8, 0xc4, 0xc4)
+X(bin16, 3, read_size_field_u16, 0xc5, 0xc5)
+X(bin32, 5, read_size_field_u32, 0xc6, 0xc6)
+X(ext8, 3, read_size_field_u8, 0xc7, 0xc7)
+X(ext16, 4, read_size_field_u16, 0xc8, 0xc8)
+X(ext32, 6, read_size_field_u32, 0xc9, 0xc9)
+X(float32, 5, read_zero, 0xca, 0xca)
+X(float64, 9, read_zero, 0xcb, 0xcb)
+X(uint8, 2, read_size_field_u8, 0xcc, 0xcc)
+X(uint16, 3, read_size_field_u16, 0xcd, 0xcd)
+X(uint32, 5, read_size_field_u32, 0xce, 0xce)
+X(uint64, 9, read_size_field_u64, 0xcf, 0xcf)
+X(int8, 2, read_size_field_s8, 0xd0, 0xd0)
+X(int16, 3, read_size_field_s16, 0xd1, 0xd1)
+X(int32, 5, read_size_field_s32, 0xd2, 0xd2)
+X(int64, 9, read_size_field_s64, 0xd3, 0xd3)
+X(fixext1, 3, read_zero, 0xd4, 0xd4)
+X(fixext2, 4,
read_zero, 0xd5, 0xd5) +X(fixext4, 6, read_zero, 0xd6, 0xd6) +X(fixext8, 10, read_zero, 0xd7, 0xd7) +X(fixext16, 18, read_zero, 0xd8, 0xd8) +X(str8, 2, read_size_field_u8, 0xd9, 0xd9) +X(str16, 3, read_size_field_u16, 0xda, 0xda) +X(str32, 5, read_size_field_u32, 0xdb, 0xdb) +X(array16, 3, read_size_field_u16, 0xdc, 0xdc) +X(array32, 5, read_size_field_u32, 0xdd, 0xdd) +X(map16, 3, read_size_field_u16, 0xde, 0xde) +X(map32, 5, read_size_field_u32, 0xdf, 0xdf) diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -196,6 +196,13 @@ return false; } +protected: + /// When overridden, prints plugin-specific kernel launch information. + /// Otherwise, default info. + virtual Error printLaunchInfo(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, uint32_t NumThreads, + uint64_t NumBlocks) const; + private: /// Prepare the arguments before launching the kernel. void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, @@ -225,6 +232,7 @@ } bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; } +protected: /// Get the execution mode name of the kernel. const char *getExecutionModeName() const { switch (ExecutionMode) { @@ -238,6 +246,7 @@ llvm_unreachable("Unknown execution mode!"); } +private: /// The kernel name. 
const char *Name; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -218,6 +218,17 @@ return initImpl(GenericDevice, Image); } +Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumThreads, + uint64_t NumBlocks) const { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), + "Launching kernel %s with %" PRIu64 + " blocks and %d threads in %s mode\n", + getName(), NumBlocks, NumThreads, getExecutionModeName()); + return Plugin::success(); +} + Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, AsyncInfoWrapperTy &AsyncInfoWrapper) const { @@ -232,10 +243,10 @@ uint64_t NumBlocks = getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount, NumThreads); - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), - "Launching kernel %s with %" PRIu64 - " blocks and %d threads in %s mode\n", - getName(), NumBlocks, NumThreads, getExecutionModeName()); + Error Err = printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks); + if (Err) { + return Err; + } return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, KernelArgsPtr, AsyncInfoWrapper);