diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt --- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt +++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt @@ -61,6 +61,8 @@ set(LIBOMPTARGET_DEP_LIBRARIES) endif() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") + add_library(omptarget.rtl.amdgpu SHARED impl/impl.cpp impl/interop_hsa.cpp @@ -89,6 +91,8 @@ target_link_libraries( omptarget.rtl.amdgpu PRIVATE + DeviceInterface + GlobalHandler elf_common ${LIBOMPTARGET_DEP_LIBRARIES} ${CMAKE_DL_LIBS} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h --- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h @@ -8,8 +8,8 @@ #ifndef INCLUDE_INTEROP_HSA_H_ #define INCLUDE_INTEROP_HSA_H_ -#include "impl_runtime.h" #include "hsa_api.h" +#include "impl_runtime.h" #include "internal.h" #include @@ -17,15 +17,18 @@ extern "C" { -hsa_status_t interop_hsa_get_symbol_info( - const std::map &SymbolInfoTable, - int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size); +typedef std::map SymbolInfoTableTy; +typedef std::map KernelInfoTableTy; -hsa_status_t interop_hsa_get_kernel_info( - const std::map &KernelInfoTable, - int DeviceId, const char *kernel_name, hsa_executable_symbol_info_t info, - uint32_t *value); +hsa_status_t +interop_hsa_get_symbol_info(const SymbolInfoTableTy &SymbolInfoTable, + int DeviceId, const char *symbol, void **var_addr, + unsigned int *var_size); +hsa_status_t +interop_hsa_get_kernel_info(const KernelInfoTableTy &KernelInfoTable, + int DeviceId, const char *kernel_name, + hsa_executable_symbol_info_t info, uint32_t *value); } #endif // INCLUDE_INTEROP_HSA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ 
b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -24,13 +23,18 @@ #include #include -#include "interop_hsa.h" +#include "Debug.h" #include "impl_runtime.h" +#include "interop_hsa.h" #include "internal.h" +#include "omptarget.h" #include "rt.h" #include "DeviceEnvironment.h" + +#include "GlobalHandler.h" + #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" #include "print_tracing.h" @@ -38,6 +42,36 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" +using namespace llvm::omp::plugin; + +int32_t llvm::omp::plugin::GlobalHandlerTy::getGlobalMetadataFromDevice( + int32_t DeviceId, GlobalTy &DeviceGlobal, void *SymbolInfoTablePtr) { + SymbolInfoTableTy &SymbolInfoTable = + *static_cast(SymbolInfoTablePtr); + void *DevPtr; + unsigned DevSize; + const char *Name = DeviceGlobal.getName().c_str(); + hsa_status_t Err = interop_hsa_get_symbol_info(SymbolInfoTable, DeviceId, + Name, &DevPtr, &DevSize); + + if (Err != HSA_STATUS_SUCCESS) { + // Inform the user what symbol prevented offloading + DP("Loading global '%s' (Failed)\n", Name); + return OFFLOAD_FAIL; + } + + if (DevSize != DeviceGlobal.getSize()) { + DP("Loading global '%s' - size mismatch (%u != %u)\n", Name, DevSize, + DeviceGlobal.getSize()); + return OFFLOAD_FAIL; + } + + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(e - HostBegin), Name, DPxPTR(DevPtr)); + DeviceGlobal.setPtr(DevPtr); + return OFFLOAD_SUCCESS; +} + // hostrpc interface, FIXME: consider moving to its own include these are // statically linked into amdgpu/plugin if present from hostrpc_services.a, // linked as --whole-archive to override the weak symbols that are used to @@ -416,12 +450,14 @@ // Resource pools SignalPoolT FreeSignalPool; + GlobalHandlerTy GlobalHandler; + bool hostcall_required = false; std::vector HSAExecutables; - std::vector> KernelInfoTable; - std::vector> 
SymbolInfoTable; + std::vector KernelInfoTables; + std::vector SymbolInfoTables; hsa_amd_memory_pool_t KernArgPool; @@ -708,8 +744,8 @@ NumTeams.resize(NumberOfDevices); NumThreads.resize(NumberOfDevices); deviceStateStore.resize(NumberOfDevices); - KernelInfoTable.resize(NumberOfDevices); - SymbolInfoTable.resize(NumberOfDevices); + KernelInfoTables.resize(NumberOfDevices); + SymbolInfoTables.resize(NumberOfDevices); DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); DeviceFineGrainedMemoryPools.resize(NumberOfDevices); @@ -892,30 +928,6 @@ return r; } -uint32_t elf_e_flags(__tgt_device_image *image) { - char *img_begin = (char *)image->ImageStart; - size_t img_size = (char *)image->ImageEnd - img_begin; - - Elf *e = elf_memory(img_begin, img_size); - if (!e) { - DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); - return 0; - } - - Elf64_Ehdr *eh64 = elf64_getehdr(e); - - if (!eh64) { - DP("Unable to get machine ID from ELF file!\n"); - elf_end(e); - return 0; - } - - uint32_t Flags = eh64->e_flags; - - elf_end(e); - DP("ELF Flags: 0x%x\n", Flags); - return Flags; -} } // namespace int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { @@ -1104,144 +1116,9 @@ } namespace { -Elf64_Shdr *find_only_SHT_HASH(Elf *elf) { - size_t N; - int rc = elf_getshdrnum(elf, &N); - if (rc != 0) { - return nullptr; - } - - Elf64_Shdr *result = nullptr; - for (size_t i = 0; i < N; i++) { - Elf_Scn *scn = elf_getscn(elf, i); - if (scn) { - Elf64_Shdr *shdr = elf64_getshdr(scn); - if (shdr) { - if (shdr->sh_type == SHT_HASH) { - if (result == nullptr) { - result = shdr; - } else { - // multiple SHT_HASH sections not handled - return nullptr; - } - } - } - } - } - return result; -} - -const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, - const char *symname) { - - assert(section_hash); - size_t section_symtab_index = section_hash->sh_link; - Elf64_Shdr *section_symtab = - elf64_getshdr(elf_getscn(elf, section_symtab_index)); - size_t 
section_strtab_index = section_symtab->sh_link; - - const Elf64_Sym *symtab = - reinterpret_cast(base + section_symtab->sh_offset); - - const uint32_t *hashtab = - reinterpret_cast(base + section_hash->sh_offset); - - // Layout: - // nbucket - // nchain - // bucket[nbucket] - // chain[nchain] - uint32_t nbucket = hashtab[0]; - const uint32_t *bucket = &hashtab[2]; - const uint32_t *chain = &hashtab[nbucket + 2]; - - const size_t max = strlen(symname) + 1; - const uint32_t hash = elf_hash(symname); - for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) { - char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name); - if (strncmp(symname, n, max) == 0) { - return &symtab[i]; - } - } - - return nullptr; -} - -struct symbol_info { - void *addr = nullptr; - uint32_t size = UINT32_MAX; - uint32_t sh_type = SHT_NULL; -}; - -int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, - symbol_info *res) { - if (elf_kind(elf) != ELF_K_ELF) { - return 1; - } - - Elf64_Shdr *section_hash = find_only_SHT_HASH(elf); - if (!section_hash) { - return 1; - } - - const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname); - if (!sym) { - return 1; - } - - if (sym->st_size > UINT32_MAX) { - return 1; - } - - if (sym->st_shndx == SHN_UNDEF) { - return 1; - } - - Elf_Scn *section = elf_getscn(elf, sym->st_shndx); - if (!section) { - return 1; - } - - Elf64_Shdr *header = elf64_getshdr(section); - if (!header) { - return 1; - } - - res->addr = sym->st_value + base; - res->size = static_cast(sym->st_size); - res->sh_type = header->sh_type; - return 0; -} - -int get_symbol_info_without_loading(char *base, size_t img_size, - const char *symname, symbol_info *res) { - Elf *elf = elf_memory(base, img_size); - if (elf) { - int rc = get_symbol_info_without_loading(elf, base, symname, res); - elf_end(elf); - return rc; - } - return 1; -} - -hsa_status_t interop_get_symbol_info(char *base, size_t img_size, - const char *symname, void **var_addr, - 
uint32_t *var_size) { - symbol_info si; - int rc = get_symbol_info_without_loading(base, img_size, symname, &si); - if (rc == 0) { - *var_addr = si.addr; - *var_size = si.size; - return HSA_STATUS_SUCCESS; - } else { - return HSA_STATUS_ERROR; - } -} - template hsa_status_t module_register_from_memory_to_place( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, + KernelInfoTableTy &KernelInfoTable, SymbolInfoTableTy &SymbolInfoTable, void *module_bytes, size_t module_size, int DeviceId, C cb, std::vector &HSAExecutables) { auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t { @@ -1255,27 +1132,6 @@ } } // namespace -static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) { - uint64_t device_State_bytes = 0; - { - // If this is the deviceRTL, get the state variable size - symbol_info size_si; - int rc = get_symbol_info_without_loading( - ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si); - - if (rc == 0) { - if (size_si.size != sizeof(uint64_t)) { - DP("Found device_State_size variable with wrong size\n"); - return 0; - } - - // Read number of bytes directly from the elf - memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t)); - } - } - return device_State_bytes; -} - static __tgt_target_table * __tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); @@ -1290,102 +1146,6 @@ return res; } -struct device_environment { - // initialise an DeviceEnvironmentTy in the deviceRTL - // patches around differences in the deviceRTL between trunk, aomp, - // rocmcc. Over time these differences will tend to zero and this class - // simplified. - // Symbol may be in .data or .bss, and may be missing fields, todo: - // review aomp/trunk/rocm and simplify the following - - // The symbol may also have been deadstripped because the device side - // accessors were unused. - - // If the symbol is in .data (aomp, rocm) it can be written directly. 
- // If it is in .bss, we must wait for it to be allocated space on the - // gpu (trunk) and initialize after loading. - const char *sym() { return "omptarget_device_environment"; } - - DeviceEnvironmentTy host_device_env; - symbol_info si; - bool valid = false; - - __tgt_device_image *image; - const size_t img_size; - - device_environment(int device_id, int number_devices, - __tgt_device_image *image, const size_t img_size) - : image(image), img_size(img_size) { - - host_device_env.NumDevices = number_devices; - host_device_env.DeviceNum = device_id; - host_device_env.DebugKind = 0; - host_device_env.DynamicMemSize = 0; - if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { - host_device_env.DebugKind = std::stoi(envStr); - } - - int rc = get_symbol_info_without_loading((char *)image->ImageStart, - img_size, sym(), &si); - if (rc != 0) { - DP("Finding global device environment '%s' - symbol missing.\n", sym()); - return; - } - - if (si.size > sizeof(host_device_env)) { - DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size, - sizeof(host_device_env)); - return; - } - - valid = true; - } - - bool in_image() { return si.sh_type != SHT_NOBITS; } - - hsa_status_t before_loading(void *data, size_t size) { - if (valid) { - if (in_image()) { - DP("Setting global device environment before load (%u bytes)\n", - si.size); - uint64_t offset = (char *)si.addr - (char *)image->ImageStart; - void *pos = (char *)data + offset; - memcpy(pos, &host_device_env, si.size); - } - } - return HSA_STATUS_SUCCESS; - } - - hsa_status_t after_loading() { - if (valid) { - if (!in_image()) { - DP("Setting global device environment after load (%u bytes)\n", - si.size); - int device_id = host_device_env.DeviceNum; - auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; - void *state_ptr; - uint32_t state_ptr_size; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size); - if (err != HSA_STATUS_SUCCESS) { - 
DP("failed to find %s in loaded image\n", sym()); - return err; - } - - if (state_ptr_size != si.size) { - DP("Symbol had size %u before loading, %u after\n", state_ptr_size, - si.size); - return HSA_STATUS_ERROR; - } - - return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env, - state_ptr_size, device_id); - } - } - return HSA_STATUS_SUCCESS; - } -}; - static hsa_status_t impl_calloc(void **ret_ptr, size_t size, int DeviceId) { uint64_t rounded = 4 * ((size + 3) / 4); void *ptr; @@ -1406,12 +1166,6 @@ return HSA_STATUS_SUCCESS; } -static bool image_contains_symbol(void *data, size_t size, const char *sym) { - symbol_info si; - int rc = get_symbol_info_without_loading((char *)data, size, sym, &si); - return (rc == 0) && (si.addr != nullptr); -} - __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image) { // This function loads the device image onto gpu[device_id] and does other @@ -1447,42 +1201,27 @@ return NULL; } + // send device environment data to the device { - auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, - img_size); - - auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id]; - auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = module_register_from_memory_to_place( - KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id, - [&](void *data, size_t size) { - if (image_contains_symbol(data, size, "needs_hostcall_buffer")) { - __atomic_store_n(&DeviceInfo.hostcall_required, true, - __ATOMIC_RELEASE); - } - return env.before_loading(data, size); - }, - DeviceInfo.HSAExecutables); - - check("Module registering", err); - if (err != HSA_STATUS_SUCCESS) { - const char *DeviceName = DeviceInfo.GPUName[device_id].c_str(); - const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image)); - - if (strcmp(DeviceName, ElfName) != 0) { - DP("Possible gpu arch mismatch: device:%s, image:%s please check" - " compiler flag: -march=\n", - 
DeviceName, ElfName); - } else { - DP("Error loading image onto GPU: %s\n", get_error_string(err)); - } - - return NULL; - } - - err = env.after_loading(); - if (err != HSA_STATUS_SUCCESS) { - return NULL; + uint32_t DynamicMemorySize = 0; + uint32_t DebugKind = 0; + if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) + DebugKind = std::stoi(EnvStr); + + // TODO: The device ID used here is not the real device ID used by OpenMP. + StaticGlobalTy DeviceEnvGlobal( + "omptarget_device_environment", DebugKind, + static_cast(DeviceInfo.NumberOfDevices), + static_cast(device_id), + static_cast(DynamicMemorySize)); + auto &SymbolInfoTable = DeviceInfo.SymbolInfoTables[device_id]; + // TODO: Implement "writeGlobalToImage" in the GlobalHandler. + if (!DeviceInfo.GlobalHandler.writeGlobalToDevice( + device_id, DeviceEnvGlobal, &SymbolInfoTable)) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, device_id, + "Failed to write device environment, abort."); + // TODO: Check the device gfx name against the image gfx name. + return nullptr; } } @@ -1492,62 +1231,48 @@ // the device_State array is either large value in bss or a void* that // needs to be assigned to a pointer to an array of size device_state_bytes // If absent, it has been deadstripped and needs no setup. 
- - void *state_ptr; - uint32_t state_ptr_size; - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr, - &state_ptr_size); - - if (err != HSA_STATUS_SUCCESS) { - DP("No device_state symbol found, skipping initialization\n"); + StaticGlobalTy DeviceStateGlobal("omptarget_nvptx_device_State"); + auto &SymbolInfoMap = DeviceInfo.SymbolInfoTables[device_id]; + if (!DeviceInfo.GlobalHandler.getGlobalMetadataFromDevice( + device_id, DeviceStateGlobal, &SymbolInfoMap)) { + DP("No device_state pointer symbol found, skipping initialization\n"); } else { - if (state_ptr_size < sizeof(void *)) { - DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size, - sizeof(void *)); + StaticGlobalTy DeviceStateSizeGlobal( + "omptarget_nvptx_device_State_size"); + if (!DeviceInfo.GlobalHandler.readGlobalFromImage( + device_id, DeviceStateGlobal, (char *)image->ImageStart, + img_size)) { + DP("Can't initialize device_State, missing size information\n"); return NULL; } - - // if it's larger than a void*, assume it's a bss array and no further - // initialization is required. 
Only try to set up a pointer for - // sizeof(void*) - if (state_ptr_size == sizeof(void *)) { - uint64_t device_State_bytes = - get_device_State_bytes((char *)image->ImageStart, img_size); - if (device_State_bytes == 0) { - DP("Can't initialize device_State, missing size information\n"); + hsa_status_t Err; + uint64_t DeviceStateSize = DeviceStateSizeGlobal.getValue(); + auto &DSS = DeviceInfo.deviceStateStore[device_id]; + if (DSS.first.get() == nullptr) { + assert(DSS.second == 0); + void *Ptr = NULL; + Err = impl_calloc(&Ptr, DeviceStateSize, device_id); + if (Err != HSA_STATUS_SUCCESS) { + DP("Failed to allocate device_state array\n"); return NULL; } + DSS = { + std::unique_ptr{Ptr}, + DeviceStateSize, + }; + } - auto &dss = DeviceInfo.deviceStateStore[device_id]; - if (dss.first.get() == nullptr) { - assert(dss.second == 0); - void *ptr = NULL; - hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id); - if (err != HSA_STATUS_SUCCESS) { - DP("Failed to allocate device_state array\n"); - return NULL; - } - dss = { - std::unique_ptr{ptr}, - device_State_bytes, - }; - } - - void *ptr = dss.first.get(); - if (device_State_bytes != dss.second) { - DP("Inconsistent sizes of device_State unsupported\n"); - return NULL; - } + if (DeviceStateSize != DSS.second) { + DP("Inconsistent sizes of device_State unsupported\n"); + return NULL; + } - // write ptr to device memory so it can be used by later kernels - err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, - sizeof(void *), device_id); - if (err != HSA_STATUS_SUCCESS) { - DP("memcpy install of state_ptr failed\n"); - return NULL; - } + // write ptr to device memory so it can be used by later kernels + DeviceStateGlobal.setValue(DSS.first.get()); + if (!DeviceInfo.GlobalHandler.writeGlobalToDevice( + device_id, DeviceStateGlobal, &SymbolInfoMap)) { + DP("memcpy install of state_ptr failed\n"); + return NULL; } } } @@ -1563,6 +1288,8 @@ __tgt_offload_entry *HostBegin = image->EntriesBegin; 
__tgt_offload_entry *HostEnd = image->EntriesEnd; + // TODO: This is basically the same in the AMDGPU and CUDA plugin, + // refactor. for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { if (!e->addr) { @@ -1576,28 +1303,12 @@ if (e->size) { __tgt_offload_entry entry = *e; - void *varptr; - uint32_t varsize; - - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfoMap, device_id, e->name, &varptr, &varsize); - - if (err != HSA_STATUS_SUCCESS) { - // Inform the user what symbol prevented offloading - DP("Loading global '%s' (Failed)\n", e->name); - return NULL; - } - - if (varsize != e->size) { - DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name, - varsize, e->size); - return NULL; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(e - HostBegin), e->name, DPxPTR(varptr)); - entry.addr = (void *)varptr; + StaticGlobalTy Global(e->name); + auto &SymbolInfoMap = DeviceInfo.SymbolInfoTables[device_id]; + if (!DeviceInfo.GlobalHandler.getGlobalMetadataFromDevice( + device_id, Global, &SymbolInfoMap)) + return nullptr; + entry.addr = Global.getPtr(); DeviceInfo.addOffloadEntry(device_id, entry); @@ -1606,13 +1317,10 @@ // If unified memory is present any target link variables // can access host addresses directly. There is no longer a // need for device copies. 
- err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, - sizeof(void *), device_id); - if (err != HSA_STATUS_SUCCESS) - DP("Error when copying USM\n"); - DP("Copy linked variable host address (" DPxMOD ")" - "to device address (" DPxMOD ")\n", - DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); + Global.setValue(e->addr); + if (!DeviceInfo.GlobalHandler.writeGlobalToDevice(device_id, Global, + &SymbolInfoMap)) + return nullptr; } continue; @@ -1621,11 +1329,12 @@ DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); uint32_t kernarg_segment_size; - auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; + auto &KernelInfoMap = DeviceInfo.KernelInfoTables[device_id]; hsa_status_t err = interop_hsa_get_kernel_info( KernelInfoMap, device_id, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); + (void)err; // each arg is a void * in this openmp implementation uint32_t arg_num = kernarg_segment_size / sizeof(void *); @@ -1635,10 +1344,6 @@ *it = sizeof(void *); } - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal = - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; - // get flat group size if present, else Default_WG_Size int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; @@ -1649,125 +1354,45 @@ uint16_t TSize; uint16_t WG_Size; }; - struct KernDescValType KernDescVal; - std::string KernDescNameStr(e->name); - KernDescNameStr += "_kern_desc"; - const char *KernDescName = KernDescNameStr.c_str(); - - void *KernDescPtr; - uint32_t KernDescSize; - void *CallStackAddr = nullptr; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - KernDescName, &KernDescPtr, &KernDescSize); - - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)KernDescSize != sizeof(KernDescVal)) - DP("Loading global computation properties '%s' - size mismatch (%u != " - "%lu)\n", - KernDescName, KernDescSize, sizeof(KernDescVal)); - - 
memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); - - // Check structure size against recorded size. - if ((size_t)KernDescSize != KernDescVal.TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescVal), KernDescVal.TSize, KernDescName); - - DP("After loading global for %s KernDesc \n", KernDescName); - DP("KernDesc: Version: %d\n", KernDescVal.Version); - DP("KernDesc: TSize: %d\n", KernDescVal.TSize); - DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); - if (KernDescVal.WG_Size == 0) { - KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; - DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); - } - WGSizeVal = KernDescVal.WG_Size; - DP("WGSizeVal %d\n", WGSizeVal); - check("Loading KernDesc computation property", err); - } else { - DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); - - // Flat group size - std::string WGSizeNameStr(e->name); - WGSizeNameStr += "_wg_size"; - const char *WGSizeName = WGSizeNameStr.c_str(); - - void *WGSizePtr; - uint32_t WGSize; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - WGSizeName, &WGSizePtr, &WGSize); - - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)WGSize != sizeof(int16_t)) { - DP("Loading global computation properties '%s' - size mismatch (%u " - "!= " - "%lu)\n", - WGSizeName, WGSize, sizeof(int16_t)); - return NULL; - } - - memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - - DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - - if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || - WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { - DP("Error wrong WGSize value specified in HSA code object file: " - "%d\n", + // Read the kernel description global from the binary. 
+ StaticGlobalTy KernelDescGlobal(e->name, "_kern_desc"); + if (!DeviceInfo.GlobalHandler.readGlobalFromImage( + device_id, KernelDescGlobal, (char *)image->ImageStart, img_size)) { + // No kernel description available, fallback to work group size global: + // Read work group size global from the binary. + StaticGlobalTy WGSizeGlobal(e->name, "_wg_size"); + if (!DeviceInfo.GlobalHandler.readGlobalFromImage( + device_id, WGSizeGlobal, (char *)image->ImageStart, img_size)) { + WGSizeGlobal.setValue(WGSizeVal); + INFO(OMP_INFOTYPE_DATA_TRANSFER, device_id, + "Failed to work group size for %s, defaulting to %i.", e->name, WGSizeVal); - WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; - } - } else { - DP("Warning: Loading WGSize '%s' - symbol not found, " - "using default value %d\n", - WGSizeName, WGSizeVal); } - - check("Loading WGSize computation property", err); + WGSizeVal = WGSizeGlobal.getValue(); + } else if (KernelDescGlobal.getValue().WG_Size) { + if (sizeof(KernDescValType) != KernelDescGlobal.getValue().TSize) + DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", + sizeof(KernDescValType), KernelDescGlobal.getValue().TSize, + KernelDescGlobal.getName().c_str()); + WGSizeVal = KernelDescGlobal.getValue().WG_Size; } - // Read execution mode from global in binary - std::string ExecModeNameStr(e->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - void *ExecModePtr; - uint32_t varsize; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - ExecModeName, &ExecModePtr, &varsize); - - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global computation properties '%s' - size mismatch(%u != " - "%lu)\n", - ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return NULL; - } - - memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); - - DP("After loading global for %s ExecMode = %d\n", ExecModeName, - 
ExecModeVal); - - if (ExecModeVal < 0 || - ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { - DP("Error wrong exec_mode value specified in HSA code object file: " - "%d\n", - ExecModeVal); - return NULL; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value " - "GENERIC (1)\n", - ExecModeName); + // Read execution mode global from the binary + StaticGlobalTy ExecModeGlobal(e->name, + "_exec_mode"); + if (!DeviceInfo.GlobalHandler.readGlobalFromImage( + device_id, ExecModeGlobal, (char *)image->ImageStart, img_size)) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, device_id, + "Failed to read execution mode for %s, defaulting to SPMD.", + e->name); + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); } - check("Loading computation property", err); - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, - CallStackAddr, e->name, kernarg_segment_size, - DeviceInfo.KernArgPool)); + void *CallStackAddr = nullptr; + KernelsList.push_back( + KernelTy(ExecModeGlobal.getValue(), WGSizeVal, device_id, CallStackAddr, + e->name, kernarg_segment_size, DeviceInfo.KernArgPool)); __tgt_offload_entry entry = *e; entry.addr = (void *)&KernelsList.back(); DeviceInfo.addOffloadEntry(device_id, entry); @@ -2054,15 +1679,15 @@ KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; std::string kernel_name = std::string(KernelInfo->Name); - auto &KernelInfoTable = DeviceInfo.KernelInfoTable; - if (KernelInfoTable[device_id].find(kernel_name) == - KernelInfoTable[device_id].end()) { + auto &KernelInfoTables = DeviceInfo.KernelInfoTables; + if (KernelInfoTables[device_id].find(kernel_name) == + KernelInfoTables[device_id].end()) { DP("Kernel %s not found\n", kernel_name.c_str()); return OFFLOAD_FAIL; } const atl_kernel_info_t KernelInfoEntry = - KernelInfoTable[device_id][kernel_name]; + KernelInfoTables[device_id][kernel_name]; const uint32_t group_segment_size = KernelInfoEntry.group_segment_size; const uint32_t sgpr_count = 
KernelInfoEntry.sgpr_count; const uint32_t vgpr_count = KernelInfoEntry.vgpr_count; diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt --- a/openmp/libomptarget/plugins/common/CMakeLists.txt +++ b/openmp/libomptarget/plugins/common/CMakeLists.txt @@ -10,5 +10,7 @@ # ##===----------------------------------------------------------------------===## +add_subdirectory(DeviceInterface) add_subdirectory(elf_common) add_subdirectory(MemoryManager) +add_subdirectory(GlobalHandler) diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/DeviceInterface/CMakeLists.txt copy from openmp/libomptarget/plugins/common/CMakeLists.txt copy to openmp/libomptarget/plugins/common/DeviceInterface/CMakeLists.txt --- a/openmp/libomptarget/plugins/common/CMakeLists.txt +++ b/openmp/libomptarget/plugins/common/DeviceInterface/CMakeLists.txt @@ -5,10 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # ##===----------------------------------------------------------------------===## -# -# Common parts which can be used by all plugins -# -##===----------------------------------------------------------------------===## -add_subdirectory(elf_common) -add_subdirectory(MemoryManager) +add_library(DeviceInterface INTERFACE) + +target_include_directories(DeviceInterface INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h b/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h @@ -0,0 +1,48 @@ +//===- DeviceInterface.h - Target independent plugin device interface -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_DEVICEINTERFACE_DEVICEINTERFACE_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_DEVICEINTERFACE_DEVICEINTERFACE_H + +#include "omptarget.h" + +extern "C" { +int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size); +int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size, __tgt_async_info *AsyncInfo); +int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, + int64_t Size); +int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, + void *TargetPtr, int64_t Size, + __tgt_async_info *AsyncInfo); +} + +namespace llvm { +namespace omp { +namespace plugin { + +struct DeviceInterfaceTy { + static int32_t memcpyDtoH(int32_t DeviceId, void *Dst, const void *Src, + int32_t Size) { + return __tgt_rtl_data_retrieve(DeviceId, Dst, const_cast(Src), + Size); + } + static int32_t memcpyHtoD(int32_t DeviceId, void *Dst, const void *Src, + int32_t Size) { + return __tgt_rtl_data_submit(DeviceId, Dst, const_cast(Src), Size); + } +}; + +} // namespace plugin +} // namespace omp +} // namespace llvm + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_DEVICEINTERFACE_DEVICEINTERFACE_H diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/GlobalHandler/CMakeLists.txt copy from openmp/libomptarget/plugins/common/CMakeLists.txt copy to openmp/libomptarget/plugins/common/GlobalHandler/CMakeLists.txt --- a/openmp/libomptarget/plugins/common/CMakeLists.txt +++ b/openmp/libomptarget/plugins/common/GlobalHandler/CMakeLists.txt @@ -5,10 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # 
##===----------------------------------------------------------------------===## -# -# Common parts which can be used by all plugins -# -##===----------------------------------------------------------------------===## -add_subdirectory(elf_common) -add_subdirectory(MemoryManager) +add_library(GlobalHandler INTERFACE) + +target_include_directories(GlobalHandler INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/openmp/libomptarget/plugins/common/GlobalHandler/GlobalHandler.h b/openmp/libomptarget/plugins/common/GlobalHandler/GlobalHandler.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/common/GlobalHandler/GlobalHandler.h @@ -0,0 +1,253 @@ +//===- GlobalHandler.h - Target independent global & enviroment handling --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent global handler and environment manager. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_GLOBALHANDLER_GLOBALHANDLER_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_GLOBALHANDLER_GLOBALHANDLER_H + +#include + +#include "Debug.h" +#include "omptarget.h" +#include "llvm/Object/ELFObjectFile.h" + +#include "DeviceInterface.h" + +namespace llvm { +namespace omp { +namespace plugin { + +using namespace llvm::object; + +/// Common abstraction for globals that live on the host and device. +/// It simply encapsulates the symbol name, symbol size, and symbol address +/// (which might be host or device depending on the context). +/// TODO: We should probably keep both the host and device pointer in this +/// structure to avoid multiple lookups, e.g. 
when we write device globals +/// in USM mode after we looked up their device address. +class GlobalTy { + std::string Name; + uint32_t Size; + void *Ptr; + +public: + GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr) + : Name(Name), Size(Size), Ptr(Ptr) {} + + const std::string &getName() const { return Name; } + uint32_t getSize() const { return Size; } + void *getPtr() const { return Ptr; } + + /// The size is stored as uint32_t, so it is taken as uint32_t as well; a + /// signed parameter would turn sizes above INT32_MAX negative first. + void setSize(uint32_t S) { Size = S; } + void setPtr(void *P) { Ptr = P; } +}; + +/// Subclass of GlobalTy that holds the memory for a global of \p Ty. The +/// pointer of the GlobalTy base initially refers to the embedded value and the +/// size is sizeof(Ty). +template <typename Ty> class StaticGlobalTy : public GlobalTy { + Ty Data; + +public: + template <typename... Args> + StaticGlobalTy(const std::string &Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward<Args>(args)...}) {} + template <typename... Args> + StaticGlobalTy(const char *Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward<Args>(args)...}) {} + template <typename... Args> + StaticGlobalTy(const char *Name, const char *Suffix, Args &&...args) + : GlobalTy(std::string(Name) + Suffix, sizeof(Ty), &Data), + Data(Ty{std::forward<Args>(args)...}) {} + + Ty &getValue() { return Data; } + const Ty &getValue() const { return Data; } + void setValue(const Ty &V) { Data = V; } +}; + +/// Subclass of GlobalTy that holds the memory which may exceed the global type +/// \p Ty. The storage is obtained with malloc(Size) in the constructor and the +/// destructor frees whatever getPtr() returns, so the object is not copyable +/// and setPtr() must not be used to repoint it. The storage is neither checked +/// for allocation failure nor initialized. +template <typename Ty> class DynamicGlobalTy : public GlobalTy { +public: + DynamicGlobalTy(const std::string &Name, uint32_t Size) + : GlobalTy(Name, Size, malloc(Size)) {} + DynamicGlobalTy(const char *Name, const char *Suffix, uint32_t Size) + : GlobalTy(std::string(Name) + Suffix, Size, malloc(Size)) {} + ~DynamicGlobalTy() { free(getPtr()); } + + // Copying would make two objects free the same allocation. + DynamicGlobalTy(const DynamicGlobalTy &) = delete; + DynamicGlobalTy &operator=(const DynamicGlobalTy &) = delete; + + Ty &getValue() { return *static_cast<Ty *>(getPtr()); } + const Ty &getValue() const { return *static_cast<const Ty *>(getPtr()); } + // getPtr() is a void *, which cannot be dereferenced; assign through the + // typed reference instead. + void setValue(const Ty &V) { getValue() = V; } +}; + +/// Helper class to do the heavy lifting when it comes to moving globals between +/// host and device.
Through the DeviceInterfaceTy we access memcpyDtoH and +/// HtoD, which means the only thing each plugin has to provide is the +/// retrieval of global metadata (size, addr) from the device +/// (\see getGlobalMetadataFromDevice). +class GlobalHandlerTy { + /// Actually move memory between host and device. See readGlobalFromDevice and + /// writeGlobalToDevice for the interface description. The device address is + /// looked up via getGlobalMetadataFromDevice using the name and size of + /// \p HostGlobal; \p Device2Host selects the copy direction. Returns the + /// first non-zero error code encountered, OFFLOAD_SUCCESS otherwise. + int32_t moveGlobalBetweenDeviceAndHost(int32_t DeviceId, + const GlobalTy &HostGlobal, + bool Device2Host, void *Payload) { + GlobalTy DeviceGlobal(HostGlobal.getName(), HostGlobal.getSize()); + int32_t Err = getGlobalMetadataFromDevice(DeviceId, DeviceGlobal, Payload); + if (Err) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to read global symbol metadata for '%s' from the device", + HostGlobal.getName().c_str()); + return Err; + } + + if (Device2Host) + Err = DeviceInterfaceTy::memcpyDtoH(DeviceId, HostGlobal.getPtr(), + DeviceGlobal.getPtr(), + HostGlobal.getSize()); + else + Err = DeviceInterfaceTy::memcpyHtoD(DeviceId, DeviceGlobal.getPtr(), + HostGlobal.getPtr(), + HostGlobal.getSize()); + + if (Err) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to %s %u bytes associated with global symbol '%s' %s " + "the device", + Device2Host ? "read" : "write", HostGlobal.getSize(), + HostGlobal.getName().c_str(), Device2Host ? "from" : "to"); + return Err; + } + + // Log the pointers in copy order (source -> destination) for both + // directions. + void *SrcPtr = Device2Host ? DeviceGlobal.getPtr() : HostGlobal.getPtr(); + void *DstPtr = Device2Host ? HostGlobal.getPtr() : DeviceGlobal.getPtr(); + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Successfully %s %u bytes associated with global symbol '%s' %s " + "the device (%p -> %p)", + Device2Host ? "read" : "write", HostGlobal.getSize(), + HostGlobal.getName().c_str(), Device2Host ? "from" : "to", SrcPtr, + DstPtr); + return OFFLOAD_SUCCESS; + } + + /// Get the address and size of a global in the image. Return success + /// or failure. Address and size are returned in \p ImageGlobal, the global + /// name is passed in \p ImageGlobal.
+ int32_t getGlobalMetadataFromImage(int32_t DeviceId, GlobalTy &ImageGlobal, + const char *ImageStart, + uint64_t ImageSize) { + // TODO: We should wrap ELF handling into a caching object. + MemoryBufferRef MBR(StringRef(ImageStart, ImageSize), "Image"); + Expected<ELF64LEObjectFile> ELF = ELF64LEObjectFile::create(MBR); + if (!ELF) { + // A failed Expected has to be consumed before it is destroyed. + consumeError(ELF.takeError()); + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, "Unable to open ELF image."); + return OFFLOAD_FAIL; + } + + // Then extract the base address of elf image. + // NOTE(review): getStartAddress() presumably yields the ELF entry point + // rather than the address of the in-memory image buffer; confirm that + // ELFStartAddr plus the symbol value is a host pointer into + // [ImageStart, ImageStart + ImageSize). + Expected<uint64_t> StartAddr = ELF.get().getStartAddress(); + if (!StartAddr) { + consumeError(StartAddr.takeError()); + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Unable to determine ELF start address."); + return OFFLOAD_FAIL; + } + char *ELFStartAddr = reinterpret_cast<char *>(StartAddr.get()); + + for (auto &It : ELF.get().symbols()) { + // First check the name, continue if we don't match. + Expected<StringRef> Name = It.getName(); + if (!Name) { + consumeError(Name.takeError()); + continue; + } + if (!Name.get().equals(ImageGlobal.getName())) + continue; + + // If we match we will either succeed or fail with retrieving the content, + // either way, the loop is done. First step is to record the size. + ImageGlobal.setSize(It.getSize()); + + // Then extract the relative offset from the elf image base. + Expected<uint64_t> Offset = It.getValue(); + if (!Offset) { + consumeError(Offset.takeError()); + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Global symbol '%s' was found in the elf image but address could " + "not be determined.", + ImageGlobal.getName().c_str()); + return OFFLOAD_FAIL; + } + ImageGlobal.setPtr(ELFStartAddr + Offset.get()); + + return OFFLOAD_SUCCESS; + } + + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Global symbol '%s' was not found in the elf image.", + ImageGlobal.getName().c_str()); + return OFFLOAD_FAIL; + } + +public: + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and host source are defined by + /// \p HostGlobal. Return success or failure. Payload is specific to the + /// plugin.
+ int32_t writeGlobalToDevice(int32_t DeviceId, const GlobalTy &HostGlobal, + void *Payload) { + return moveGlobalBetweenDeviceAndHost(DeviceId, HostGlobal, + /* Device2Host */ false, Payload); + } + + /// Read the memory associated with a global from the image and store it on + /// the host. The name, size, and destination are defined by \p HostGlobal. + /// The image is the ELF blob of \p ImageSize bytes at \p ImageStart. Return + /// success or failure. + int32_t readGlobalFromImage(int32_t DeviceId, const GlobalTy &HostGlobal, + const char *ImageStart, uint64_t ImageSize) { + // The size is a placeholder; the lookup below fills in the real one. + GlobalTy ImageGlobal(HostGlobal.getName(), /* Size */ 0); + int32_t Err = getGlobalMetadataFromImage(DeviceId, ImageGlobal, ImageStart, + ImageSize); + if (Err) + return Err; + + if (ImageGlobal.getSize() != HostGlobal.getSize()) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Global symbol '%s' has %u bytes in the elf image but %u bytes " + "on the host, abort transfer.", + HostGlobal.getName().c_str(), ImageGlobal.getSize(), + HostGlobal.getSize()); + return OFFLOAD_FAIL; + } + + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Global symbol '%s' was found in the elf image and %u bytes will " + "be copied from %p to %p.", + HostGlobal.getName().c_str(), HostGlobal.getSize(), + ImageGlobal.getPtr(), HostGlobal.getPtr()); + memcpy(HostGlobal.getPtr(), ImageGlobal.getPtr(), HostGlobal.getSize()); + return OFFLOAD_SUCCESS; + } + + /// Get the address and size of a global from the device. Return success + /// or failure. Address is returned in \p DeviceGlobal, the global name and + /// expected size are passed in \p DeviceGlobal. Payload is specific to the + /// plugin.
+ int32_t getGlobalMetadataFromDevice(int32_t DeviceId, GlobalTy &DeviceGlobal, + void *Payload); + + /// Copy the memory associated with a global from the device to its + /// counterpart on the host. The name, size, and destination are defined by + /// \p HostGlobal. Return success or failure. Payload is specific to the + /// plugin.
+ int32_t readGlobalFromDevice(int32_t DeviceId, const GlobalTy &HostGlobal, + void *Payload) { + return moveGlobalBetweenDeviceAndHost(DeviceId, HostGlobal, + /* Device2Host */ true, Payload); + } +}; + +} // namespace plugin +} // namespace omp +} // namespace llvm + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_GLOBALHANDLER_GLOBALHANDLER_H diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt @@ -51,8 +51,10 @@ install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") target_link_libraries(omptarget.rtl.cuda + DeviceInterface elf_common MemoryManager + GlobalHandler ${LIBOMPTARGET_DEP_LIBRARIES} ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} ${OPENMP_PTHREAD_LIB} diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -27,10 +27,13 @@ #define TARGET_NAME CUDA #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" +#include "GlobalHandler.h" #include "MemoryManager.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +using namespace llvm::omp::plugin; + // Utility for retrieving and printing CUDA error string. #ifdef OMPTARGET_DEBUG #define CUDA_ERR_STRING(err) \ @@ -322,6 +325,9 @@ }; class DeviceRTLTy { + /// The debug/configuration kind we read from LIBOMPTARGET_DEVICE_RTL_DEBUG. + /// It is only assigned when that variable is set, so default it to 0. + uint32_t DebugKind = 0; + int NumberOfDevices; // OpenMP environment properties int EnvNumTeams; @@ -340,6 +346,7 @@ std::unique_ptr StreamManager; std::vector DeviceData; std::vector Modules; + GlobalHandlerTy GlobalHandler; /// A class responsible for interacting with device native runtime library to /// allocate and free memory.
@@ -779,6 +786,8 @@ const __tgt_offload_entry *HostBegin = Image->EntriesBegin; const __tgt_offload_entry *HostEnd = Image->EntriesEnd; + // TODO: This is basically the same in the AMDGPU and CUDA plugin, + // refactor. std::list &KernelsList = DeviceData[DeviceId].KernelsList; for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { if (!E->addr) { @@ -791,26 +800,11 @@ if (E->size) { __tgt_offload_entry Entry = *E; - CUdeviceptr CUPtr; - size_t CUSize; - Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading global '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); + // Describe the global with the size recorded in the host entry so the + // handler validates the device symbol against E->size, as the + // replaced code did. A plain GlobalTy is used because the lookup + // overwrites the pointer with the device address. + GlobalTy Global(E->name, E->size); + // The handler returns OFFLOAD_SUCCESS, which GlobalHandlerTy itself + // treats as a falsy value, so negating the result would bail out on + // success. + if (GlobalHandler.getGlobalMetadataFromDevice(DeviceId, Global, + Module) != OFFLOAD_SUCCESS) return nullptr; - } - - if (CUSize != E->size) { - DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, - CUSize, E->size); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); - - Entry.addr = (void *)(CUPtr); + Entry.addr = Global.getPtr(); // Note: In the current implementation declare target variables // can either be link or to. This means that once unified @@ -825,10 +819,9 @@ // If unified memory is present any target link or to variables // can access host addresses directly. There is no longer a // need for device copies.
- cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); - DP("Copy linked variable host address (" DPxMOD - ") to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); + // Copy the pointer-sized value stored at E->addr to the device + // global, i.e. the same bytes the replaced cuMemcpyHtoD call + // transferred, and fail the load if the transfer fails. + if (DeviceInterfaceTy::memcpyHtoD(DeviceId, Global.getPtr(), E->addr, + sizeof(void *)) != OFFLOAD_SUCCESS) + return nullptr; } addOffloadEntry(DeviceId, Entry); @@ -848,37 +841,18 @@ DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal; - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - CUdeviceptr ExecModePtr; - size_t CUSize; - Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", - ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return nullptr; - } - - Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); - if (Err != CUDA_SUCCESS) { - REPORT("Error when copying data from device to host. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", - DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value GENERIC (1)\n", - ExecModeName); + StaticGlobalTy ExecModeGlobal( + E->name, "_exec_mode"); + // TODO: We should be able to read it from the image instead.
+ // Fall back to the default only when the read actually failed. + if (GlobalHandler.readGlobalFromDevice(DeviceId, ExecModeGlobal, + Module) != OFFLOAD_SUCCESS) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to read execution mode for %s, defaulting to SPMD.", + E->name); + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); } - KernelsList.emplace_back(Func, ExecModeVal); + KernelsList.emplace_back(Func, ExecModeGlobal.getValue()); __tgt_offload_entry Entry = *E; Entry.addr = &KernelsList.back(); @@ -887,43 +861,20 @@ // send device environment data to the device { - // TODO: The device ID used here is not the real device ID used by OpenMP. - DeviceEnvironmentTy DeviceEnv{0, static_cast(NumberOfDevices), - static_cast(DeviceId), - static_cast(DynamicMemorySize)}; if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - DeviceEnv.DebugKind = std::stoi(EnvStr); - - const char *DeviceEnvName = "omptarget_device_environment"; - CUdeviceptr DeviceEnvPtr; - size_t CUSize; - - Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(DeviceEnv)) { - REPORT( - "Global device_environment '%s' - size mismatch (%zu != %zu)\n", - DeviceEnvName, CUSize, sizeof(int32_t)); - CUDA_ERR_STRING(Err); - return nullptr; - } - - Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); - if (Err != CUDA_SUCCESS) { - REPORT("Error when copying data from host to device. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", - DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } + DebugKind = std::stoi(EnvStr); - DP("Sending global device environment data %zu bytes\n", CUSize); - } else { - DP("Finding global device environment '%s' - symbol missing.\n", - DeviceEnvName); - DP("Continue, considering this is a device RTL which does not accept " - "environment setting.\n"); + // TODO: The device ID used here is not the real device ID used by OpenMP.
+ StaticGlobalTy DeviceEnvGlobal( + "omptarget_device_environment", DebugKind, + static_cast(NumberOfDevices), + static_cast(DeviceId), + static_cast(DynamicMemorySize)); + // Report the problem only when the write actually failed. + if (GlobalHandler.writeGlobalToDevice(DeviceId, DeviceEnvGlobal, + Module) != OFFLOAD_SUCCESS) { + // TODO: I think this should be fatal. + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to write device environment, continue without."); } } @@ -1405,6 +1356,30 @@ DeviceRTLTy DeviceRTL; } // namespace +/// CUDA implementation of the per-plugin metadata lookup: resolve the global +/// named by \p DeviceGlobal in the CUmodule passed via \p ModulePtr, check +/// that its size matches DeviceGlobal.getSize(), and store the address +/// reported by cuModuleGetGlobal in \p DeviceGlobal. Returns OFFLOAD_SUCCESS +/// or OFFLOAD_FAIL. +int32_t llvm::omp::plugin::GlobalHandlerTy::getGlobalMetadataFromDevice( + int32_t DeviceId, GlobalTy &DeviceGlobal, void *ModulePtr) { + CUmodule Module = static_cast<CUmodule>(ModulePtr); + CUdeviceptr CUPtr; + size_t CUSize; + const char *Name = DeviceGlobal.getName().c_str(); + CUresult Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, Name); + if (Err != CUDA_SUCCESS) { + REPORT("Loading global '%s' Failed\n", Name); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + if (CUSize != DeviceGlobal.getSize()) { + // getSize() returns uint32_t, so print it with %u rather than %zd. + DP("Loading global '%s' - size mismatch (%zu != %u)\n", Name, CUSize, + DeviceGlobal.getSize()); + return OFFLOAD_FAIL; + } + + // The offload entry index is not available in this function, so only the + // name and the device address are reported. + DP("Global %s maps to device address (" DPxMOD ")\n", Name, + DPxPTR(CUPtr)); + DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr)); + return OFFLOAD_SUCCESS; +} + // Exposed library API function #ifdef __cplusplus extern "C" {