diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -34,6 +34,7 @@ #include "DeviceEnvironment.h" #include "GlobalHandler.h" +#include "KernelHandler.h" #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" @@ -44,10 +45,17 @@ using namespace llvm::omp::plugin; +struct GenericPluginAPIPayloadTy { + SymbolInfoTableTy *SymbolInfoTable = nullptr; + KernelInfoTableTy *KernelInfoTable = nullptr; + GenericPluginAPIPayloadTy(SymbolInfoTableTy &SIT) : SymbolInfoTable(&SIT) {} + GenericPluginAPIPayloadTy(KernelInfoTableTy &KIT) : KernelInfoTable(&KIT) {} +}; + int32_t llvm::omp::plugin::GlobalHandlerTy::getGlobalMetadataFromDevice( - int32_t DeviceId, GlobalTy &DeviceGlobal, void *SymbolInfoTablePtr) { + int32_t DeviceId, GlobalTy &DeviceGlobal, void *Payload) { SymbolInfoTableTy &SymbolInfoTable = - *static_cast(SymbolInfoTablePtr); + *static_cast(Payload)->SymbolInfoTable; void *DevPtr; unsigned DevSize; const char *Name = DeviceGlobal.getName().c_str(); @@ -269,10 +277,6 @@ } }; -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. 
-std::list KernelsList; - template static hsa_status_t FindAgents(Callback CB) { hsa_status_t err = @@ -398,7 +402,7 @@ }; /// Class containing all the device information -class RTLDeviceInfoTy { +class RTLDeviceInfoTy : public DeviceInterfaceTy { HSALifetime HSA; // First field => constructed first and destructed last std::vector> FuncGblEntries; @@ -526,7 +530,8 @@ } // Record entry point associated with device - void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { + void addOffloadEntry(int32_t device_id, + const __tgt_offload_entry entry) override { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); @@ -534,6 +539,82 @@ E.Entries.push_back(entry); } + GlobalHandlerTy *getGlobalHandler() override { return &GlobalHandler; } + + int64_t getRequiresFlags() override { return RequiresFlags; } + + KernelTy *constructKernelEntry(int32_t DeviceId, + const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, + void *Payload) override { + GenericPluginAPIPayloadTy &GPPayload = + *static_cast(Payload); + KernelInfoTableTy &KernelInfoTable = *GPPayload.KernelInfoTable; + + uint32_t kernarg_segment_size; + hsa_status_t err = interop_hsa_get_kernel_info( + KernelInfoTable, DeviceId, KernelEntry->name, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &kernarg_segment_size); + (void)err; + + // get flat group size if present, else Default_WG_Size + int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + + // get Kernel Descriptor if present. + // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp + struct KernDescValType { + uint16_t Version; + uint16_t TSize; + uint16_t WG_Size; + }; + + const size_t ImageSize = + (char *)Image->ImageEnd - (char *)Image->ImageStart; + + // Read the kernel description global from the binary. 
+ StaticGlobalTy KernelDescGlobal(KernelEntry->name, + "_kern_desc"); + if (!GlobalHandler.readGlobalFromImage( + DeviceId, KernelDescGlobal, (char *)Image->ImageStart, ImageSize)) { + // No kernel description available, fallback to work group size global: + // Read work group size global from the binary. + StaticGlobalTy WGSizeGlobal(KernelEntry->name, "_wg_size"); + if (!GlobalHandler.readGlobalFromImage( + DeviceId, WGSizeGlobal, (char *)Image->ImageStart, ImageSize)) { + WGSizeGlobal.setValue(WGSizeVal); + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to work group size for %s, defaulting to %i.", + KernelEntry->name, WGSizeVal); + } + WGSizeVal = WGSizeGlobal.getValue(); + } else if (KernelDescGlobal.getValue().WG_Size) { + if (sizeof(KernDescValType) != KernelDescGlobal.getValue().TSize) + DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", + sizeof(KernDescValType), KernelDescGlobal.getValue().TSize, + KernelDescGlobal.getName().c_str()); + WGSizeVal = KernelDescGlobal.getValue().WG_Size; + } + + // Read execution mode global from the binary + StaticGlobalTy ExecModeGlobal( + KernelEntry->name, "_exec_mode"); + if (!GlobalHandler.readGlobalFromImage( + DeviceId, ExecModeGlobal, (char *)Image->ImageStart, ImageSize)) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to read execution mode for %s, defaulting to SPMD.", + KernelEntry->name); + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); + } + + auto &KernelList = getKernelsList(DeviceId); + void *CallStackAddr = nullptr; + KernelList.push_back(KernelTy(ExecModeGlobal.getValue(), WGSizeVal, + DeviceId, CallStackAddr, KernelEntry->name, + kernarg_segment_size, KernArgPool)); + return &KernelList.back(); + } + // Return true if the entry is associated with device bool findOffloadEntry(int32_t device_id, void *addr) { assert(device_id < (int32_t)FuncGblEntries.size() && @@ -733,6 +814,8 @@ DP("There are %d devices supporting HSA.\n", NumberOfDevices); } + 
initDeviceInterface(NumberOfDevices); + // Init the device info HSAQueues.resize(NumberOfDevices); FuncGblEntries.resize(NumberOfDevices); @@ -1182,7 +1265,7 @@ // - Write the pointer to the symbol omptarget_nvptx_device_State // // - Pulls some per-kernel information together from various sources and - // records it in the KernelsList for quicker access later + // records it in the KernelList for quicker access later // // The initialization can be done before or after loading the image onto the // gpu. This function presently does a mixture. Using the hsa api to get/set @@ -1214,10 +1297,10 @@ static_cast(DeviceInfo.NumberOfDevices), static_cast(device_id), static_cast(DynamicMemorySize)); - auto &SymbolInfoTable = DeviceInfo.SymbolInfoTables[device_id]; + GenericPluginAPIPayloadTy Payload(DeviceInfo.SymbolInfoTables[device_id]); // TODO: Implement "writeGlobalToImage" in the GlobalHandler. if (!DeviceInfo.GlobalHandler.writeGlobalToDevice( - device_id, DeviceEnvGlobal, &SymbolInfoTable)) { + device_id, DeviceEnvGlobal, &Payload)) { INFO(OMP_INFOTYPE_PLUGIN_KERNEL, device_id, "Failed to write device environment, abort."); // TODO: Check the device gfx name against the image gfx name. @@ -1232,9 +1315,9 @@ // needs to be assigned to a pointer to an array of size device_state_bytes // If absent, it has been deadstripped and needs no setup. 
StaticGlobalTy DeviceStateGlobal("omptarget_nvptx_device_State"); - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTables[device_id]; + GenericPluginAPIPayloadTy Payload(DeviceInfo.SymbolInfoTables[device_id]); if (!DeviceInfo.GlobalHandler.getGlobalMetadataFromDevice( - device_id, DeviceStateGlobal, &SymbolInfoMap)) { + device_id, DeviceStateGlobal, &Payload)) { DP("No device_state pointer symbol found, skipping initialization\n"); } else { StaticGlobalTy DeviceStateSizeGlobal( @@ -1270,134 +1353,18 @@ // write ptr to device memory so it can be used by later kernels DeviceStateGlobal.setValue(DSS.first.get()); if (!DeviceInfo.GlobalHandler.writeGlobalToDevice( - device_id, DeviceStateGlobal, &SymbolInfoMap)) { + device_id, DeviceStateGlobal, &Payload)) { DP("memcpy install of state_ptr failed\n"); return NULL; } } } - // Here, we take advantage of the data that is appended after img_end to get - // the symbols' name we need to load. This data consist of the host entries - // begin and end as well as the target name (see the offloading linker script - // creation in clang compiler). - - // Find the symbols in the module by name. The name can be obtain by - // concatenating the host entry name with the target name - - __tgt_offload_entry *HostBegin = image->EntriesBegin; - __tgt_offload_entry *HostEnd = image->EntriesEnd; - - // TODO: This is basically the same in the AMDGPU and CUDA plugin, - // refactor. - for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { - - if (!e->addr) { - // The host should have always something in the address to - // uniquely identify the target region. 
- DP("Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)e->size); - return NULL; - } - - if (e->size) { - __tgt_offload_entry entry = *e; - - StaticGlobalTy Global(e->name); - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTables[device_id]; - if (!DeviceInfo.GlobalHandler.getGlobalMetadataFromDevice( - device_id, Global, &SymbolInfoMap)) - return nullptr; - entry.addr = Global.getPtr(); - - DeviceInfo.addOffloadEntry(device_id, entry); - - if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - e->flags & OMP_DECLARE_TARGET_LINK) { - // If unified memory is present any target link variables - // can access host addresses directly. There is no longer a - // need for device copies. - Global.setValue(e->addr); - if (!DeviceInfo.GlobalHandler.writeGlobalToDevice(device_id, Global, - &SymbolInfoMap)) - return nullptr; - } - - continue; - } - - DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); - - uint32_t kernarg_segment_size; - auto &KernelInfoMap = DeviceInfo.KernelInfoTables[device_id]; - hsa_status_t err = interop_hsa_get_kernel_info( - KernelInfoMap, device_id, e->name, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, - &kernarg_segment_size); - (void)err; - - // each arg is a void * in this openmp implementation - uint32_t arg_num = kernarg_segment_size / sizeof(void *); - std::vector arg_sizes(arg_num); - for (std::vector::iterator it = arg_sizes.begin(); - it != arg_sizes.end(); it++) { - *it = sizeof(void *); - } - - // get flat group size if present, else Default_WG_Size - int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; - - // get Kernel Descriptor if present. - // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp - struct KernDescValType { - uint16_t Version; - uint16_t TSize; - uint16_t WG_Size; - }; - - // Read the kernel description global from the binary. 
- StaticGlobalTy KernelDescGlobal(e->name, "_kern_desc"); - if (!DeviceInfo.GlobalHandler.readGlobalFromImage( - device_id, KernelDescGlobal, (char *)image->ImageStart, img_size)) { - // No kernel description available, fallback to work group size global: - // Read work group size global from the binary. - StaticGlobalTy WGSizeGlobal(e->name, "_wg_size"); - if (!DeviceInfo.GlobalHandler.readGlobalFromImage( - device_id, WGSizeGlobal, (char *)image->ImageStart, img_size)) { - WGSizeGlobal.setValue(WGSizeVal); - INFO(OMP_INFOTYPE_DATA_TRANSFER, device_id, - "Failed to work group size for %s, defaulting to %i.", e->name, - WGSizeVal); - } - WGSizeVal = WGSizeGlobal.getValue(); - } else if (KernelDescGlobal.getValue().WG_Size) { - if (sizeof(KernDescValType) != KernelDescGlobal.getValue().TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescValType), KernelDescGlobal.getValue().TSize, - KernelDescGlobal.getName().c_str()); - WGSizeVal = KernelDescGlobal.getValue().WG_Size; - } - - // Read execution mode global from the binary - StaticGlobalTy ExecModeGlobal(e->name, - "_exec_mode"); - if (!DeviceInfo.GlobalHandler.readGlobalFromImage( - device_id, ExecModeGlobal, (char *)image->ImageStart, img_size)) { - INFO(OMP_INFOTYPE_DATA_TRANSFER, device_id, - "Failed to read execution mode for %s, defaulting to SPMD.", - e->name); - ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); - } - - void *CallStackAddr = nullptr; - KernelsList.push_back( - KernelTy(ExecModeGlobal.getValue(), WGSizeVal, device_id, CallStackAddr, - e->name, kernarg_segment_size, DeviceInfo.KernArgPool)); - __tgt_offload_entry entry = *e; - entry.addr = (void *)&KernelsList.back(); - DeviceInfo.addOffloadEntry(device_id, entry); - DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); - } + // Offload entries include globals (resolved via the symbol table) as well + // as kernels, so the payload must carry both tables. + GenericPluginAPIPayloadTy Payload(DeviceInfo.KernelInfoTables[device_id]); + Payload.SymbolInfoTable = &DeviceInfo.SymbolInfoTables[device_id]; + DeviceInfo.registerOffloadEntries(device_id, image, &Payload); return 
DeviceInfo.getOffloadEntriesTable(device_id); } diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt --- a/openmp/libomptarget/plugins/common/CMakeLists.txt +++ b/openmp/libomptarget/plugins/common/CMakeLists.txt @@ -14,3 +14,4 @@ add_subdirectory(elf_common) add_subdirectory(MemoryManager) add_subdirectory(GlobalHandler) +add_subdirectory(KernelHandler) diff --git a/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h b/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h --- a/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h +++ b/openmp/libomptarget/plugins/common/DeviceInterface/DeviceInterface.h @@ -11,34 +11,63 @@ #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_DEVICEINTERFACE_DEVICEINTERFACE_H #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_DEVICEINTERFACE_DEVICEINTERFACE_H -#include "omptarget.h" +#include +#include -extern "C" { -int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size); -int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size, __tgt_async_info *AsyncInfo); -int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, - int64_t Size); -int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, - void *TargetPtr, int64_t Size, - __tgt_async_info *AsyncInfo); -} +#include "Debug.h" +#include "omptarget.h" namespace llvm { namespace omp { namespace plugin { -struct DeviceInterfaceTy { +class GlobalHandlerTy; + +template struct DeviceInterfaceTy { + /// TODO: We probably want a device data abstraction similar to the one we + /// have in the CUDA plugin. List that contains all the kernels. 
+ std::vector<std::list<KernelTy>> KernelLists; + + virtual ~DeviceInterfaceTy() {} + + int32_t registerOffloadEntries(int32_t DeviceId, + const __tgt_device_image *Image, + void *Payload); + static int32_t memcpyDtoH(int32_t DeviceId, void *Dst, const void *Src, - int32_t Size) { - return __tgt_rtl_data_retrieve(DeviceId, Dst, const_cast(Src), - Size); - } + int32_t Size); static int32_t memcpyHtoD(int32_t DeviceId, void *Dst, const void *Src, - int32_t Size) { - return __tgt_rtl_data_submit(DeviceId, Dst, const_cast(Src), Size); + int32_t Size); + +protected: + void initDeviceInterface(int32_t NumDevices) { + KernelLists.resize(NumDevices); } + + virtual void addOffloadEntry(int32_t DeviceId, + const __tgt_offload_entry Entry) = 0; + virtual GlobalHandlerTy *getGlobalHandler() = 0; + + virtual int64_t getRequiresFlags() = 0; + + virtual KernelTy *constructKernelEntry(int32_t DeviceId, + const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, + void *Payload) = 0; + + std::list<KernelTy> &getKernelsList(int32_t DeviceId) { + return KernelLists[DeviceId]; // Mutable access: callers append kernels. + } + +private: + int32_t registerGlobalOffloadEntry(int32_t DeviceId, + const __tgt_device_image *Image, + const __tgt_offload_entry *GlobalEntry, + void *Payload); + int32_t registerKernelOffloadEntry(int32_t DeviceId, + const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, + void *Payload); }; } // namespace plugin diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -28,6 +28,7 @@ #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" #include "GlobalHandler.h" +#include "KernelHandler.h" #include "MemoryManager.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" @@ -175,9 +176,6 @@ // Structure contains per-device data struct DeviceDataTy { - /// List that contains all the kernels. 
- std::list KernelsList; - std::list FuncGblEntries; CUcontext Context = nullptr; @@ -324,7 +322,7 @@ } }; -class DeviceRTLTy { +class DeviceRTLTy : public DeviceInterfaceTy { /// The debug/configuration kind we read from LIBOMPTARGET_DEVICE_RTL_DEBUG uint32_t DebugKind; @@ -437,11 +435,48 @@ bool UseMemoryManager = true; // Record entry point associated with device - void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { + void addOffloadEntry(int32_t DeviceId, + const __tgt_offload_entry entry) override { FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); E.Entries.push_back(entry); } + GlobalHandlerTy *getGlobalHandler() override { return &GlobalHandler; } + + int64_t getRequiresFlags() override { return RequiresFlags; } + + KernelTy *constructKernelEntry(int32_t DeviceId, + const __tgt_device_image *Image, + const __tgt_offload_entry *KernelEntry, + void *Payload) override { + CUmodule Module = static_cast(Payload); + CUfunction Func; + CUresult Err = cuModuleGetFunction(&Func, Module, KernelEntry->name); + // We keep this style here because we need the name + if (Err != CUDA_SUCCESS) { + REPORT("Loading '%s' Failed\n", KernelEntry->name); + CUDA_ERR_STRING(Err); + return nullptr; + } + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(KernelEntry), + KernelEntry->name, DPxPTR(Func)); + + StaticGlobalTy ExecModeGlobal( + KernelEntry->name, "_exec_mode"); + // TODO: We should be able to read it from the image instead. 
+ if (!GlobalHandler.readGlobalFromDevice(DeviceId, ExecModeGlobal, Module)) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, + "Failed to read execution mode for %s, defaulting to SPMD.", + KernelEntry->name); + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); + } + + auto &KernelList = getKernelsList(DeviceId); + KernelList.emplace_back(Func, ExecModeGlobal.getValue()); + return &KernelList.back(); + } + // Return a pointer to the entry associated with the pointer const __tgt_offload_entry *getOffloadEntry(const int DeviceId, const void *Addr) const { @@ -515,6 +550,7 @@ return; } + initDeviceInterface(NumberOfDevices); DeviceData.resize(NumberOfDevices); // Get environment variables regarding teams @@ -782,82 +818,7 @@ Modules.push_back(Module); - // Find the symbols in the module by name. - const __tgt_offload_entry *HostBegin = Image->EntriesBegin; - const __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - // TODO: This is basically the same in the AMDGPU and CUDA plugin, - // refactor. - std::list &KernelsList = DeviceData[DeviceId].KernelsList; - for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - if (!E->addr) { - // We return nullptr when something like this happens, the host should - // have always something in the address to uniquely identify the target - // region. - DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); - return nullptr; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - StaticGlobalTy Global(E->name); - if (!GlobalHandler.getGlobalMetadataFromDevice(DeviceId, Global, - Module)) - return nullptr; - Entry.addr = Global.getPtr(); - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. 
- // TODO: when variables types other than to or link are added, - // the below condition should be changed to explicitly - // check for to and link variables types: - // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & - // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) - if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. - Global.setValue(E->addr); - if (!GlobalHandler.writeGlobalToDevice(DeviceId, Global, Module)) - return nullptr; - } - - addOffloadEntry(DeviceId, Entry); - - continue; - } - - CUfunction Func; - Err = cuModuleGetFunction(&Func, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); - - StaticGlobalTy ExecModeGlobal( - E->name, "_exec_mode"); - // TODO: We should be able to read it from the image instead. - if (!GlobalHandler.readGlobalFromDevice(DeviceId, ExecModeGlobal, - Module)) { - INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId, - "Failed to read execution mode for %s, defaulting to SPMD.", - E->name); - ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_SPMD); - } - - KernelsList.emplace_back(Func, ExecModeGlobal.getValue()); - - __tgt_offload_entry Entry = *E; - Entry.addr = &KernelsList.back(); - addOffloadEntry(DeviceId, Entry); - } + registerOffloadEntries(DeviceId, Image, Module); // send device environment data to the device { @@ -1376,7 +1337,7 @@ DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", DPxPTR(E - HostBegin), Name, DPxPTR(CUPtr)); - DeviceGlobal.setPtr(reinterpret_cast(CUPtr)); + DeviceGlobal.setPtr(reinterpret_cast(CUPtr)); return OFFLOAD_SUCCESS; }