diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -44,10 +44,19 @@ using namespace llvm::omp::plugin; +struct GenericPluginAPIPayloadTy { + SymbolInfoTableTy *SymbolInfoTable = nullptr; + KernelInfoTableTy *KernelInfoTable = nullptr; + GenericPluginAPIPayloadTy(SymbolInfoTableTy &SIT) : SymbolInfoTable(&SIT) {} + GenericPluginAPIPayloadTy(KernelInfoTableTy &KIT) : KernelInfoTable(&KIT) {} + GenericPluginAPIPayloadTy(SymbolInfoTableTy &SIT, KernelInfoTableTy &KIT) + : SymbolInfoTable(&SIT), KernelInfoTable(&KIT) {} +}; + int32_t llvm::omp::plugin::GlobalHandlerTy::getGlobalMetadataFromDevice( - int32_t DeviceId, GlobalTy &DeviceGlobal, void *SymbolInfoTablePtr) { + int32_t DeviceId, GlobalTy &DeviceGlobal, void *Payload) { SymbolInfoTableTy &SymbolInfoTable = - *static_cast(SymbolInfoTablePtr); + *static_cast(Payload)->SymbolInfoTable; void *DevPtr; unsigned DevSize; const char *Name = DeviceGlobal.getName().c_str(); @@ -269,10 +278,6 @@ } }; -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. 
-std::list KernelsList; - template static hsa_status_t FindAgents(Callback CB) { hsa_status_t err = @@ -398,7 +403,9 @@ }; /// Class containing all the device information -class RTLDeviceInfoTy { +class RTLDeviceInfoTy final : public DeviceInterfaceTy { + std::vector> KernelLists; + HSALifetime HSA; // First field => constructed first and destructed last std::vector> FuncGblEntries; @@ -526,7 +533,8 @@ } // Record entry point associated with device - void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { + void addOffloadEntry(int32_t device_id, + const __tgt_offload_entry entry) override { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); @@ -534,6 +542,81 @@ E.Entries.push_back(entry); } + GlobalHandlerTy *getGlobalHandler() override { return &GlobalHandler; } + + int64_t getRequiresFlags() override { return RequiresFlags; } + + void *constructKernelEntry(int32_t DeviceId, const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, + void *Payload) override { + GenericPluginAPIPayloadTy &GPPayload = + *static_cast(Payload); + KernelInfoTableTy &KernelInfoTable = *GPPayload.KernelInfoTable; + + uint32_t kernarg_segment_size = 0; // stays 0 if the query below fails + hsa_status_t err = interop_hsa_get_kernel_info( + KernelInfoTable, DeviceId, KernelEntry->name, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &kernarg_segment_size); + (void)err; + + // get flat group size if present, else Default_WG_Size + int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + + // get Kernel Descriptor if present. + // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp + struct KernDescValType { + uint16_t Version; + uint16_t TSize; + uint16_t WG_Size; + }; + + const size_t ImageSize = + (char *)Image->ImageEnd - (char *)Image->ImageStart; + + // Read the kernel description global from the binary. 
+ StaticGlobalTy KernelDescGlobal(KernelEntry->name, + "_kern_desc"); + if (!GlobalHandler.readGlobalFromImage( + DeviceId, KernelDescGlobal, (char *)Image->ImageStart, ImageSize)) { + // No kernel description available, fallback to work group size global: + // Read work group size global from the binary. + StaticGlobalTy WGSizeGlobal(KernelEntry->name, "_wg_size"); + if (!GlobalHandler.readGlobalFromImage( + DeviceId, WGSizeGlobal, (char *)Image->ImageStart, ImageSize)) { + WGSizeGlobal.setValue(WGSizeVal); + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to work group size for %s, defaulting to %i.", + KernelEntry->name, WGSizeVal); + } + WGSizeVal = WGSizeGlobal.getValue(); + } else if (KernelDescGlobal.getValue().WG_Size) { + if (sizeof(KernDescValType) != KernelDescGlobal.getValue().TSize) + DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", + sizeof(KernDescValType), KernelDescGlobal.getValue().TSize, + KernelDescGlobal.getName().c_str()); + WGSizeVal = KernelDescGlobal.getValue().WG_Size; + } + + // Read execution mode global from the binary + StaticGlobalTy ExecModeGlobal( + KernelEntry->name, "_exec_mode"); + if (!GlobalHandler.readGlobalFromImage( + DeviceId, ExecModeGlobal, (char *)Image->ImageStart, ImageSize)) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to read execution mode for %s, defaulting to SPMD.", + KernelEntry->name); + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); + } + + auto &KernelList = KernelLists[DeviceId]; + void *CallStackAddr = nullptr; + KernelList.push_back(KernelTy(ExecModeGlobal.getValue(), WGSizeVal, + DeviceId, CallStackAddr, KernelEntry->name, + kernarg_segment_size, KernArgPool)); + return &KernelList.back(); + } + // Return true if the entry is associated with device bool findOffloadEntry(int32_t device_id, void *addr) { assert(device_id < (int32_t)FuncGblEntries.size() && @@ -733,6 +816,8 @@ DP("There are %d devices supporting HSA.\n", NumberOfDevices); } + 
initDeviceInterface(NumberOfDevices); + + // Init the device info HSAQueues.resize(NumberOfDevices); FuncGblEntries.resize(NumberOfDevices); @@ -1182,7 +1267,7 @@ // - Write the pointer to the symbol omptarget_nvptx_device_State // // - Pulls some per-kernel information together from various sources and - // records it in the KernelsList for quicker access later + // records it in the KernelList for quicker access later // // The initialization can be done before or after loading the image onto the // gpu. This function presently does a mixture. Using the hsa api to get/set @@ -1201,29 +1286,10 @@ return NULL; } - // send device environment data to the device - { - uint32_t DynamicMemorySize = 0; - uint32_t DebugKind = 0; - if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - DebugKind = std::stoi(EnvStr); - - // TODO: The device ID used here is not the real device ID used by OpenMP. - StaticGlobalTy DeviceEnvGlobal( - "omptarget_device_environment", DebugKind, - static_cast(DeviceInfo.NumberOfDevices), - static_cast(device_id), - static_cast(DynamicMemorySize)); - auto &SymbolInfoTable = DeviceInfo.SymbolInfoTables[device_id]; - // TODO: Implement "writeGlobalToImage" in the GlobalHandler. - if (!DeviceInfo.GlobalHandler.writeGlobalToDevice( - device_id, DeviceEnvGlobal, &SymbolInfoTable)) { - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, device_id, - "Failed to write device environment, abort."); - // TODO: Check the device gfx name against the image gfx name. - return nullptr; - } - } + GenericPluginAPIPayloadTy Payload(DeviceInfo.SymbolInfoTables[device_id], + DeviceInfo.KernelInfoTables[device_id]); + if (DeviceInfo.setupDeviceEnvironment(device_id, &Payload) != OFFLOAD_SUCCESS) // status code: OFFLOAD_SUCCESS is 0 + return nullptr; DP("AMDGPU module successfully loaded!\n"); @@ -1232,9 +1298,8 @@ // needs to be assigned to a pointer to an array of size device_state_bytes // If absent, it has been deadstripped and needs no setup. 
StaticGlobalTy DeviceStateGlobal("omptarget_nvptx_device_State"); - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTables[device_id]; if (!DeviceInfo.GlobalHandler.getGlobalMetadataFromDevice( - device_id, DeviceStateGlobal, &SymbolInfoMap)) { + device_id, DeviceStateGlobal, &Payload)) { DP("No device_state pointer symbol found, skipping initialization\n"); } else { StaticGlobalTy DeviceStateSizeGlobal( @@ -1270,134 +1335,15 @@ // write ptr to device memory so it can be used by later kernels DeviceStateGlobal.setValue(DSS.first.get()); if (!DeviceInfo.GlobalHandler.writeGlobalToDevice( - device_id, DeviceStateGlobal, &SymbolInfoMap)) { + device_id, DeviceStateGlobal, &Payload)) { DP("memcpy install of state_ptr failed\n"); return NULL; } } } - // Here, we take advantage of the data that is appended after img_end to get - // the symbols' name we need to load. This data consist of the host entries - // begin and end as well as the target name (see the offloading linker script - // creation in clang compiler). - - // Find the symbols in the module by name. The name can be obtain by - // concatenating the host entry name with the target name - - __tgt_offload_entry *HostBegin = image->EntriesBegin; - __tgt_offload_entry *HostEnd = image->EntriesEnd; - - // TODO: This is basically the same in the AMDGPU and CUDA plugin, - // refactor. - for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { - - if (!e->addr) { - // The host should have always something in the address to - // uniquely identify the target region. 
- DP("Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)e->size); - return NULL; - } - - if (e->size) { - __tgt_offload_entry entry = *e; - - StaticGlobalTy Global(e->name); - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTables[device_id]; - if (!DeviceInfo.GlobalHandler.getGlobalMetadataFromDevice( - device_id, Global, &SymbolInfoMap)) - return nullptr; - entry.addr = Global.getPtr(); - - DeviceInfo.addOffloadEntry(device_id, entry); - - if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - e->flags & OMP_DECLARE_TARGET_LINK) { - // If unified memory is present any target link variables - // can access host addresses directly. There is no longer a - // need for device copies. - Global.setValue(e->addr); - if (!DeviceInfo.GlobalHandler.writeGlobalToDevice(device_id, Global, - &SymbolInfoMap)) - return nullptr; - } - - continue; - } - - DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); - - uint32_t kernarg_segment_size; - auto &KernelInfoMap = DeviceInfo.KernelInfoTables[device_id]; - hsa_status_t err = interop_hsa_get_kernel_info( - KernelInfoMap, device_id, e->name, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, - &kernarg_segment_size); - (void)err; - - // each arg is a void * in this openmp implementation - uint32_t arg_num = kernarg_segment_size / sizeof(void *); - std::vector arg_sizes(arg_num); - for (std::vector::iterator it = arg_sizes.begin(); - it != arg_sizes.end(); it++) { - *it = sizeof(void *); - } - - // get flat group size if present, else Default_WG_Size - int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; - - // get Kernel Descriptor if present. - // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp - struct KernDescValType { - uint16_t Version; - uint16_t TSize; - uint16_t WG_Size; - }; - - // Read the kernel description global from the binary. 
- StaticGlobalTy KernelDescGlobal(e->name, "_kern_desc"); - if (!DeviceInfo.GlobalHandler.readGlobalFromImage( - device_id, KernelDescGlobal, (char *)image->ImageStart, img_size)) { - // No kernel description available, fallback to work group size global: - // Read work group size global from the binary. - StaticGlobalTy WGSizeGlobal(e->name, "_wg_size"); - if (!DeviceInfo.GlobalHandler.readGlobalFromImage( - device_id, WGSizeGlobal, (char *)image->ImageStart, img_size)) { - WGSizeGlobal.setValue(WGSizeVal); - INFO(OMP_INFOTYPE_DATA_TRANSFER, device_id, - "Failed to work group size for %s, defaulting to %i.", e->name, - WGSizeVal); - } - WGSizeVal = WGSizeGlobal.getValue(); - } else if (KernelDescGlobal.getValue().WG_Size) { - if (sizeof(KernDescValType) != KernelDescGlobal.getValue().TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescValType), KernelDescGlobal.getValue().TSize, - KernelDescGlobal.getName().c_str()); - WGSizeVal = KernelDescGlobal.getValue().WG_Size; - } - - // Read execution mode global from the binary - StaticGlobalTy ExecModeGlobal(e->name, - "_exec_mode"); - if (!DeviceInfo.GlobalHandler.readGlobalFromImage( - device_id, ExecModeGlobal, (char *)image->ImageStart, img_size)) { - INFO(OMP_INFOTYPE_DATA_TRANSFER, device_id, - "Failed to read execution mode for %s, defaulting to SPMD.", - e->name); - ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); - } - - void *CallStackAddr = nullptr; - KernelsList.push_back( - KernelTy(ExecModeGlobal.getValue(), WGSizeVal, device_id, CallStackAddr, - e->name, kernarg_segment_size, DeviceInfo.KernArgPool)); - __tgt_offload_entry entry = *e; - entry.addr = (void *)&KernelsList.back(); - DeviceInfo.addOffloadEntry(device_id, entry); - DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); - } + if (DeviceInfo.registerOffloadEntries(device_id, image, &Payload) != OFFLOAD_SUCCESS) // status code: OFFLOAD_SUCCESS is 0 + return nullptr; return DeviceInfo.getOffloadEntriesTable(device_id); } diff --git 
a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt --- a/openmp/libomptarget/plugins/common/CMakeLists.txt +++ b/openmp/libomptarget/plugins/common/CMakeLists.txt @@ -14,3 +14,4 @@ add_subdirectory(elf_common) add_subdirectory(MemoryManager) add_subdirectory(GlobalHandler) +add_subdirectory(KernelHandler) diff --git a/openmp/libomptarget/plugins/common/DeviceInterface/CMakeLists.txt b/openmp/libomptarget/plugins/common/DeviceInterface/CMakeLists.txt --- a/openmp/libomptarget/plugins/common/DeviceInterface/CMakeLists.txt +++ b/openmp/libomptarget/plugins/common/DeviceInterface/CMakeLists.txt @@ -6,6 +6,17 @@ # ##===----------------------------------------------------------------------===## -add_library(DeviceInterface INTERFACE) +add_library(DeviceInterface OBJECT DeviceInterface.cpp) + +# Build DeviceInterface with PIC to be able to link it with plugin shared libraries. +set_property(TARGET DeviceInterface PROPERTY POSITION_INDEPENDENT_CODE ON) +llvm_update_compile_flags(DeviceInterface) +set(LINK_LLVM_LIBS LLVMSupport LLVMFrontendOpenMP) +if (LLVM_LINK_LLVM_DYLIB) + set(LINK_LLVM_LIBS LLVM) +endif() +target_link_libraries(DeviceInterface PRIVATE GlobalHandler ${LINK_LLVM_LIBS}) +include_directories(${LIBOMPTARGET_LLVM_INCLUDE_DIRS}) +add_dependencies(DeviceInterface ${LINK_LLVM_LIBS}) target_include_directories(DeviceInterface INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h b/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h --- a/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h +++ b/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h @@ -11,34 +11,67 @@ #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_DEVICEINTERFACE_DEVICEINTERFACE_H #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_DEVICEINTERFACE_DEVICEINTERFACE_H -#include "omptarget.h" +#include +#include -extern "C" { -int32_t 
__tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size); -int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size, __tgt_async_info *AsyncInfo); -int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, - int64_t Size); -int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, - void *TargetPtr, int64_t Size, - __tgt_async_info *AsyncInfo); -} +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "omptarget.h" namespace llvm { namespace omp { namespace plugin { +int32_t memcpyDtoH(int32_t DeviceId, void *Dst, const void *Src, int32_t Size); +int32_t memcpyHtoD(int32_t DeviceId, void *Dst, const void *Src, int32_t Size); + +class GlobalHandlerTy; + struct DeviceInterfaceTy { - static int32_t memcpyDtoH(int32_t DeviceId, void *Dst, const void *Src, - int32_t Size) { - return __tgt_rtl_data_retrieve(DeviceId, Dst, const_cast(Src), - Size); - } - static int32_t memcpyHtoD(int32_t DeviceId, void *Dst, const void *Src, - int32_t Size) { - return __tgt_rtl_data_submit(DeviceId, Dst, const_cast(Src), Size); + DeviceEnvironmentTy DeviceEnvironment; + + virtual ~DeviceInterfaceTy() {} + + int32_t setupDeviceEnvironment(int32_t DeviceId, void *Payload); + + int32_t registerOffloadEntries(int32_t DeviceId, + const __tgt_device_image *Image, + void *Payload); + +protected: + void initDeviceInterface(int32_t NumDevices) { + DeviceEnvironment.NumDevices = NumDevices; + + if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { + DeviceEnvironment.DebugKind = std::stoi(EnvStr); + } + if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { + DeviceEnvironment.DynamicMemSize = std::stoi(EnvStr); + DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", + DeviceEnvironment.DynamicMemSize); + } } + + virtual void addOffloadEntry(int32_t DeviceId, + const __tgt_offload_entry Entry) = 0; + virtual GlobalHandlerTy *getGlobalHandler() = 0; + + virtual int64_t 
getRequiresFlags() = 0; + + virtual void *constructKernelEntry(int32_t DeviceId, + const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, + void *Payload) = 0; + +private: + int32_t registerGlobalOffloadEntry(int32_t DeviceId, + const __tgt_device_image *Image, + const __tgt_offload_entry *GlobalEntry, + void *Payload); + int32_t registerKernelOffloadEntry(int32_t DeviceId, + const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, + void *Payload); }; } // namespace plugin diff --git a/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.cpp b/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.cpp @@ -0,0 +1,127 @@ +//===- DeviceInterface.cpp - Target independent plugin device interface ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "DeviceInterface.h" +#include "Debug.h" +#include "GlobalHandler.h" +#include "omptarget.h" + +extern "C" { +int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size); +int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size, __tgt_async_info *AsyncInfo); +int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, + int64_t Size); +int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, + void *TargetPtr, int64_t Size, + __tgt_async_info *AsyncInfo); +} + +using namespace llvm; +using namespace omp; +using namespace plugin; + +int32_t DeviceInterfaceTy::setupDeviceEnvironment(int32_t DeviceId, + void *Payload) { + + // TODO: The device ID used here is not the real device ID used by OpenMP. + DeviceEnvironment.DeviceNum = DeviceId; + + GlobalHandlerTy &GlobalHandler = *getGlobalHandler(); + GlobalTy DeviceEnvGlobal("omptarget_device_environment", + sizeof(DeviceEnvironmentTy), &DeviceEnvironment); + // TODO: Implement "writeGlobalToImage" in the GlobalHandler. + if (!GlobalHandler.writeGlobalToDevice(DeviceId, DeviceEnvGlobal, Payload)) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Failed to write device environment, abort."); + // TODO: Check the device gfx name against the image gfx name. 
+ return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t DeviceInterfaceTy::registerOffloadEntries( + int32_t DeviceId, const __tgt_device_image *Image, void *Payload) { + const __tgt_offload_entry *Begin = Image->EntriesBegin; + const __tgt_offload_entry *End = Image->EntriesEnd; + for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) { + if (!Entry->addr) { + // The host should always have something in the address to + // uniquely identify the entry. + INFO(OMP_INFOTYPE_ALL, DeviceId, + "Unexpected host entry without address (size: %zu), abort!\n", + Entry->size); + return OFFLOAD_FAIL; + } + + if (Entry->size) { + if (registerGlobalOffloadEntry(DeviceId, Image, Entry, Payload) != OFFLOAD_SUCCESS) // helpers return OFFLOAD_* status codes + return OFFLOAD_FAIL; + } else { + if (registerKernelOffloadEntry(DeviceId, Image, Entry, Payload) != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + } + } + return OFFLOAD_SUCCESS; +} + +int32_t DeviceInterfaceTy::registerGlobalOffloadEntry( + int32_t DeviceId, const __tgt_device_image *Image, + const __tgt_offload_entry *GlobalEntry, void *Payload) { + + GlobalHandlerTy &GlobalHandler = *getGlobalHandler(); + __tgt_offload_entry DeviceEntry = *GlobalEntry; + StaticGlobalTy Global(GlobalEntry->name); + if (!GlobalHandler.getGlobalMetadataFromDevice(DeviceId, Global, Payload)) + return OFFLOAD_FAIL; + DeviceEntry.addr = Global.getPtr(); + + // Note: In the current implementation declare target variables + // can either be link or to. This means that once unified + // memory is activated via the requires directive, the variable + // can be used directly from the host in both cases. + // TODO: when variable types other than to or link are added, + // the below condition should be changed to explicitly + // check for to and link variable types. + if (getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY && + (GlobalEntry->flags & OMP_DECLARE_TARGET_LINK)) { + // If unified memory is present any target link or to variables + // can access host addresses directly. 
There is no longer a + // need for device copies. + Global.setValue(GlobalEntry->addr); + if (!GlobalHandler.writeGlobalToDevice(DeviceId, Global, Payload)) + return OFFLOAD_FAIL; + } + + addOffloadEntry(DeviceId, DeviceEntry); + return OFFLOAD_SUCCESS; +} + +int32_t DeviceInterfaceTy::registerKernelOffloadEntry( + int32_t DeviceId, const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, void *Payload) { + __tgt_offload_entry DeviceEntry = *KernelEntry; + void *Kernel = constructKernelEntry(DeviceId, Image, KernelEntry, Payload); + if (!Kernel) + return OFFLOAD_FAIL; + DeviceEntry.addr = (void *)Kernel; + addOffloadEntry(DeviceId, DeviceEntry); + return OFFLOAD_SUCCESS; +} + +int32_t llvm::omp::plugin::memcpyDtoH(int32_t DeviceId, void *Dst, + const void *Src, int32_t Size) { + return __tgt_rtl_data_retrieve(DeviceId, Dst, const_cast(Src), Size); +} +int32_t llvm::omp::plugin::memcpyHtoD(int32_t DeviceId, void *Dst, + const void *Src, int32_t Size) { + return __tgt_rtl_data_submit(DeviceId, Dst, const_cast(Src), Size); +} diff --git a/openmp/libomptarget/plugins/common/GlobalHandler/CMakeLists.txt b/openmp/libomptarget/plugins/common/GlobalHandler/CMakeLists.txt --- a/openmp/libomptarget/plugins/common/GlobalHandler/CMakeLists.txt +++ b/openmp/libomptarget/plugins/common/GlobalHandler/CMakeLists.txt @@ -8,4 +8,5 @@ add_library(GlobalHandler INTERFACE) +target_link_libraries(GlobalHandler INTERFACE DeviceInterface) target_include_directories(GlobalHandler INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/openmp/libomptarget/plugins/common/GlobalHandler/GlobalHandler.h b/openmp/libomptarget/plugins/common/GlobalHandler/GlobalHandler.h --- a/openmp/libomptarget/plugins/common/GlobalHandler/GlobalHandler.h +++ b/openmp/libomptarget/plugins/common/GlobalHandler/GlobalHandler.h @@ -109,13 +109,11 @@ } if (Device2Host) - Err = DeviceInterfaceTy::memcpyDtoH(DeviceId, HostGlobal.getPtr(), - DeviceGlobal.getPtr(), - HostGlobal.getSize()); + Err = 
memcpyDtoH(DeviceId, HostGlobal.getPtr(), DeviceGlobal.getPtr(), + HostGlobal.getSize()); else - Err = DeviceInterfaceTy::memcpyHtoD(DeviceId, DeviceGlobal.getPtr(), - HostGlobal.getPtr(), - HostGlobal.getSize()); + Err = memcpyHtoD(DeviceId, DeviceGlobal.getPtr(), HostGlobal.getPtr(), + HostGlobal.getSize()); if (Err) { INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -324,9 +324,7 @@ } }; -class DeviceRTLTy { - /// The debug/configuration kind we read from LIBOMPTARGET_DEVICE_RTL_DEBUG - uint32_t DebugKind; +class DeviceRTLTy final : public DeviceInterfaceTy { int NumberOfDevices; // OpenMP environment properties @@ -335,8 +333,6 @@ int EnvTeamThreadLimit; // OpenMP requires flags int64_t RequiresFlags; - // Amount of dynamic shared memory to use at launch. - uint64_t DynamicMemorySize; static constexpr const int HardTeamLimit = 1U << 16U; // 64k static constexpr const int HardThreadLimit = 1024; @@ -437,11 +433,47 @@ bool UseMemoryManager = true; // Record entry point associated with device - void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { + void addOffloadEntry(int32_t DeviceId, + const __tgt_offload_entry entry) override { FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); E.Entries.push_back(entry); } + GlobalHandlerTy *getGlobalHandler() override { return &GlobalHandler; } + + int64_t getRequiresFlags() override { return RequiresFlags; } + + void *constructKernelEntry(int32_t DeviceId, const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, + void *Payload) override { + CUmodule Module = static_cast(Payload); + CUfunction Func; + CUresult Err = cuModuleGetFunction(&Func, Module, KernelEntry->name); + // We keep this style here because we need the name + if (Err != CUDA_SUCCESS) { + REPORT("Loading 
'%s' Failed\n", KernelEntry->name); + CUDA_ERR_STRING(Err); + return nullptr; + } + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(KernelEntry), + KernelEntry->name, DPxPTR(Func)); + + StaticGlobalTy ExecModeGlobal( + KernelEntry->name, "_exec_mode"); + // TODO: We should be able to read it from the image instead. + if (!GlobalHandler.readGlobalFromDevice(DeviceId, ExecModeGlobal, Module)) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to read execution mode for %s, defaulting to SPMD.", + KernelEntry->name); + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); + } + + auto &KernelList = DeviceData[DeviceId].KernelsList; + KernelList.emplace_back(Func, ExecModeGlobal.getValue()); + return &KernelList.back(); + } + // Return a pointer to the entry associated with the pointer const __tgt_offload_entry *getOffloadEntry(const int DeviceId, const void *Addr) const { @@ -491,8 +523,7 @@ DeviceRTLTy() : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), - EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), - DynamicMemorySize(0) { + EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) { DP("Start initializing CUDA\n"); @@ -515,6 +546,7 @@ return; } + initDeviceInterface(NumberOfDevices); DeviceData.resize(NumberOfDevices); // Get environment variables regarding teams @@ -533,12 +565,6 @@ EnvNumTeams = std::stoi(EnvStr); DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); } - if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { - // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set - DynamicMemorySize = std::stoi(EnvStr); - DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", - DynamicMemorySize); - } StreamManager = std::make_unique(NumberOfDevices, DeviceData); @@ -782,101 +808,11 @@ Modules.push_back(Module); - // Find the symbols in the module by name. 
- const __tgt_offload_entry *HostBegin = Image->EntriesBegin; - const __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - // TODO: This is basically the same in the AMDGPU and CUDA plugin, - // refactor. - std::list &KernelsList = DeviceData[DeviceId].KernelsList; - for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - if (!E->addr) { - // We return nullptr when something like this happens, the host should - // have always something in the address to uniquely identify the target - // region. - DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); - return nullptr; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - StaticGlobalTy Global(E->name); - if (!GlobalHandler.getGlobalMetadataFromDevice(DeviceId, Global, - Module)) - return nullptr; - Entry.addr = Global.getPtr(); - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. - // TODO: when variables types other than to or link are added, - // the below condition should be changed to explicitly - // check for to and link variables types: - // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & - // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) - if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. 
- Global.setValue(E->addr); - if (!GlobalHandler.writeGlobalToDevice(DeviceId, Global, Module)) - return nullptr; - } - - addOffloadEntry(DeviceId, Entry); - - continue; - } - - CUfunction Func; - Err = cuModuleGetFunction(&Func, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); - - StaticGlobalTy ExecModeGlobal( - E->name, "_exec_mode"); - // TODO: We should be able to read it from the image instead. - if (!GlobalHandler.readGlobalFromDevice(DeviceId, ExecModeGlobal, - Module)) { - INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, - "Failed to read execution mode for %s, defaulting to SPMD.", - E->name); - ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); - } - - KernelsList.emplace_back(Func, ExecModeGlobal.getValue()); - - __tgt_offload_entry Entry = *E; - Entry.addr = &KernelsList.back(); - addOffloadEntry(DeviceId, Entry); - } + if (setupDeviceEnvironment(DeviceId, Module) != OFFLOAD_SUCCESS) // status code: OFFLOAD_SUCCESS is 0 + return nullptr; - // send device environment data to the device - { - if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - DebugKind = std::stoi(EnvStr); - - // TODO: The device ID used here is not the real device ID used by OpenMP. - StaticGlobalTy DeviceEnvGlobal( - "omptarget_device_environment", DebugKind, - static_cast(NumberOfDevices), - static_cast(DeviceId), - static_cast(DynamicMemorySize)); - if (!GlobalHandler.writeGlobalToDevice(DeviceId, DeviceEnvGlobal, - Module)) { - // TODO: I think this should be fatal. 
- INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, - "Failed to write device environment, continue without."); - } - } + if (registerOffloadEntries(DeviceId, Image, Module) != OFFLOAD_SUCCESS) // status code: OFFLOAD_SUCCESS is 0 + return nullptr; return getOffloadEntriesTable(DeviceId); } @@ -1129,7 +1065,8 @@ Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, /* gridDimZ */ 1, CudaThreadsPerBlock, /* blockDimY */ 1, /* blockDimZ */ 1, - DynamicMemorySize, Stream, &Args[0], nullptr); + DeviceEnvironment.DynamicMemSize, Stream, &Args[0], + nullptr); if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) return OFFLOAD_FAIL; @@ -1376,7 +1313,7 @@ DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", DPxPTR(E - HostBegin), Name, DPxPTR(CUPtr)); - DeviceGlobal.setPtr(reinterpret_cast(CUPtr)); + DeviceGlobal.setPtr(reinterpret_cast(CUPtr)); return OFFLOAD_SUCCESS; }