diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -15,11 +15,15 @@
 #define _OMPTARGET_H_
 
 #include <cstdint>
 #include <deque>
+#include <functional>
 #include <stddef.h>
 #include <stdint.h>
+#include <type_traits>
+
+#include "llvm/ADT/SmallVector.h"
 
 #define OFFLOAD_SUCCESS (0)
 #define OFFLOAD_FAIL (~0)
@@ -185,10 +189,19 @@
   /// as long as this AsyncInfoTy object.
   std::deque<void *> BufferLocations;
 
+  /// Post-processing operations executed after a successful synchronization.
+  using PostProcFuncTy = std::function<int()>;
+  llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions;
+
   __tgt_async_info AsyncInfo;
   DeviceTy &Device;
 
 public:
+  enum class SyncType {
+    BLOCKING,
+    NON_BLOCKING
+  };
+
   AsyncInfoTy(DeviceTy &Device) : Device(Device) {}
   ~AsyncInfoTy() { synchronize(); }
@@ -199,11 +212,22 @@
   /// Synchronize all pending actions.
   ///
   /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
-  int synchronize();
+  int synchronize(SyncType SyncType = SyncType::BLOCKING);
 
   /// Return a void* reference with a lifetime that is at least as long as this
   /// AsyncInfoTy object. The location can be used as intermediate buffer.
   void *&getVoidPtrLocation();
+
+  /// Returns if all asynchronous operations are completed.
+  bool isDone();
+
+  template <typename FuncTy>
+  void addPostProcessingFunction(FuncTy Function) {
+    static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>,
+                  "Invalid post-processing function type. Please check "
+                  "function signature!");
+    PostProcessingFunctions.emplace_back(Function);
+  }
 };
 
 /// This struct is a record of non-contiguous information
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -107,8 +107,41 @@
     void *NoAliasDepList) {
   TIMESCOPE_WITH_IDENT(Loc);
 
-  __tgt_target_data_begin_mapper(Loc, DeviceId, ArgNum, ArgsBase, Args,
-                                 ArgSizes, ArgTypes, ArgNames, ArgMappers);
+  DP("Entering data begin region for device %" PRId64 " with %d mappings\n",
+     DeviceId, ArgNum);
+  if (checkDeviceAndCtors(DeviceId, Loc)) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceId);
+    return;
+  }
+
+  DeviceTy &Device = *PM->Devices[DeviceId];
+
+  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
+    printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames,
+                         "Entering OpenMP data region");
+#ifdef OMPTARGET_DEBUG
+  for (int I = 0; I < ArgNum; ++I) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 ", Name=%s\n",
+       I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I],
+       (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown");
+  }
+#endif
+
+  bool ShouldDispatch = true;
+  int GTID = __kmpc_global_thread_num(NULL);
+  AsyncInfoTy *AsyncInfo = acquireTaskAsyncInfo(GTID, Device, ShouldDispatch);
+
+  int Rc = OFFLOAD_SUCCESS;
+  if (ShouldDispatch) {
+    Rc = targetDataBegin(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes,
+                         ArgTypes, ArgNames, ArgMappers, *AsyncInfo);
+  }
+  if (Rc == OFFLOAD_SUCCESS)
+    Rc = AsyncInfo->synchronize(AsyncInfoTy::SyncType::NON_BLOCKING);
+  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+
+  completeTaskAsyncInfo(GTID, AsyncInfo);
 }
 
 /// passes data from the target, releases target memory and destroys
@@ -155,9 +188,40 @@
     void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
     void *NoAliasDepList) {
   TIMESCOPE_WITH_IDENT(Loc);
+  DP("Entering data end region with %d mappings\n", ArgNum);
+  if (checkDeviceAndCtors(DeviceId, Loc)) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceId);
+    return;
+  }
 
-  __tgt_target_data_end_mapper(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
-                               ArgTypes, ArgNames, ArgMappers);
+  DeviceTy &Device = *PM->Devices[DeviceId];
+
+  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
+    printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames,
+                         "Exiting OpenMP data region");
+#ifdef OMPTARGET_DEBUG
+  for (int I = 0; I < ArgNum; ++I) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 ", Name=%s\n",
+       I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I],
+       (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown");
+  }
+#endif
+
+  bool ShouldDispatch = true;
+  int GTID = __kmpc_global_thread_num(NULL);
+  AsyncInfoTy *AsyncInfo = acquireTaskAsyncInfo(GTID, Device, ShouldDispatch);
+
+  int Rc = OFFLOAD_SUCCESS;
+  if (ShouldDispatch) {
+    Rc = targetDataEnd(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes,
+                       ArgNames, ArgMappers, *AsyncInfo);
+  }
+  if (Rc == OFFLOAD_SUCCESS)
+    Rc = AsyncInfo->synchronize(AsyncInfoTy::SyncType::NON_BLOCKING);
+  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+
+  completeTaskAsyncInfo(GTID, AsyncInfo);
 }
 
 EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
@@ -192,9 +256,32 @@
     void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
     void *NoAliasDepList) {
   TIMESCOPE_WITH_IDENT(Loc);
+  DP("Entering data update with %d mappings\n", ArgNum);
+  if (checkDeviceAndCtors(DeviceId, Loc)) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceId);
+    return;
+  }
+
+  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
+    printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames,
+                         "Updating OpenMP data");
+
+  DeviceTy &Device = *PM->Devices[DeviceId];
 
-  __tgt_target_data_update_mapper(Loc, DeviceId, ArgNum, ArgsBase, Args,
-                                  ArgSizes, ArgTypes, ArgNames, ArgMappers);
+  bool ShouldDispatch = true;
+  int GTID = __kmpc_global_thread_num(NULL);
+  AsyncInfoTy *AsyncInfo = acquireTaskAsyncInfo(GTID, Device, ShouldDispatch);
+
+  int Rc = OFFLOAD_SUCCESS;
+  if (ShouldDispatch) {
+    Rc = targetDataUpdate(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes,
+                          ArgTypes, ArgNames, ArgMappers, *AsyncInfo);
+  }
+  if (Rc == OFFLOAD_SUCCESS)
+    Rc = AsyncInfo->synchronize(AsyncInfoTy::SyncType::NON_BLOCKING);
+  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+
+  completeTaskAsyncInfo(GTID, AsyncInfo);
 }
 
 /// Implements a kernel entry that executes the target region on the specified
@@ -260,9 +347,57 @@
     void *HostPtr, __tgt_kernel_arguments *Args, int32_t DepNum, void *DepList,
     int32_t NoAliasDepNum, void *NoAliasDepList) {
   TIMESCOPE_WITH_IDENT(Loc);
+  DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64
+     "\n",
+     DPxPTR(HostPtr), DeviceId);
+  if (Args->Version != 1) {
+    DP("Unexpected ABI version: %d\n", Args->Version);
+  }
+  if (checkDeviceAndCtors(DeviceId, Loc)) {
+    DP("Not offloading to device %" PRId64 "\n", DeviceId);
+    return OMP_TGT_FAIL;
+  }
 
-  return __tgt_target_kernel(Loc, DeviceId, NumTeams, ThreadLimit, HostPtr,
-                             Args);
+  if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
+    printKernelArguments(Loc, DeviceId, Args->NumArgs, Args->ArgSizes,
+                         Args->ArgTypes, Args->ArgNames,
+                         "Entering OpenMP kernel");
+#ifdef OMPTARGET_DEBUG
+  for (int I = 0; I < Args->NumArgs; ++I) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+       ", Type=0x%" PRIx64 ", Name=%s\n",
+       I, DPxPTR(Args->ArgBasePtrs[I]), DPxPTR(Args->ArgPtrs[I]),
+       Args->ArgSizes[I], Args->ArgTypes[I],
+       (Args->ArgNames) ? getNameFromMapping(Args->ArgNames[I]).c_str()
+                        : "unknown");
+  }
+#endif
+
+  bool IsTeams = NumTeams != -1;
+  if (!IsTeams)
+    NumTeams = 0;
+
+  DeviceTy &Device = *PM->Devices[DeviceId];
+
+  bool ShouldDispatch = true;
+  int GTID = __kmpc_global_thread_num(NULL);
+  AsyncInfoTy *AsyncInfo = acquireTaskAsyncInfo(GTID, Device, ShouldDispatch);
+
+  int Rc = OFFLOAD_SUCCESS;
+  if (ShouldDispatch) {
+    Rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs,
+                Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames,
+                Args->ArgMappers, NumTeams, ThreadLimit, Args->Tripcount,
+                IsTeams, *AsyncInfo);
+  }
+  if (Rc == OFFLOAD_SUCCESS)
+    Rc = AsyncInfo->synchronize(AsyncInfoTy::SyncType::NON_BLOCKING);
+  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+
+  completeTaskAsyncInfo(GTID, AsyncInfo);
+
+  assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
+  return OMP_TGT_SUCCESS;
 }
 
 // Get the current number of components for a user-defined mapper.
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -22,15 +22,33 @@
 
 using llvm::SmallVector;
 
-int AsyncInfoTy::synchronize() {
+int AsyncInfoTy::synchronize(SyncType SyncType) {
   int Result = OFFLOAD_SUCCESS;
   if (AsyncInfo.Queue) {
-    // If we have a queue we need to synchronize it now.
-    Result = Device.synchronize(*this);
-    assert(AsyncInfo.Queue == nullptr &&
-           "The device plugin should have nulled the queue to indicate there "
-           "are no outstanding actions!");
+    switch (SyncType) {
+    case SyncType::BLOCKING:
+      // If we have a queue we need to synchronize it now.
+      Result = Device.synchronize(*this);
+      assert(AsyncInfo.Queue == nullptr &&
+             "The device plugin should have nulled the queue to indicate there "
+             "are no outstanding actions!");
+      break;
+    case SyncType::NON_BLOCKING:
+      Result = Device.synchronizeAsync(*this);
+      break;
+    }
+  }
+
+  // Run any pending post-processing function registered on this async object.
+  if (Result == OFFLOAD_SUCCESS && isDone()) {
+    for (auto &PostProcFunc : PostProcessingFunctions) {
+      Result = PostProcFunc();
+      if (Result != OFFLOAD_SUCCESS)
+        break;
+    }
+    PostProcessingFunctions.clear();
   }
+
   return Result;
 }
@@ -39,6 +57,8 @@
   return BufferLocations.back();
 }
 
+bool AsyncInfoTy::isDone() { return AsyncInfo.Queue == nullptr; }
+
 /* All begin addresses for partially mapped structs must be 8-aligned in order
  * to ensure proper alignment of members. E.g.
 *
@@ -680,7 +700,7 @@
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
-  int Ret;
+  int Ret = OFFLOAD_SUCCESS;
   SmallVector<PostProcessingInfo> PostProcessingPtrs;
   void *FromMapperBase = nullptr;
   // process each input.
@@ -839,75 +859,80 @@
     }
   }
 
-  // TODO: We should not synchronize here but pass the AsyncInfo object to the
-  // allocate/deallocate device APIs.
-  //
-  // We need to synchronize before deallocating data.
-  Ret = AsyncInfo.synchronize();
-  if (Ret != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  // Deallocate target pointer
-  for (PostProcessingInfo &Info : PostProcessingPtrs) {
-    // If we marked the entry to be deleted we need to verify no other thread
-    // reused it by now. If deletion is still supposed to happen by this thread
-    // LR will be set and exclusive access to the HDTT map will avoid another
-    // thread reusing the entry now. Note that we do not request (exclusive)
-    // access to the HDTT map if Info.DelEntry is not set.
-    LookupResult LR;
-    DeviceTy::HDTTMapAccessorTy HDTTMap =
-        Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry);
-
-    if (Info.DelEntry) {
-      LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize);
-      if (LR.Entry->getTotalRefCount() != 0 ||
-          LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) {
-        // The thread is not in charge of deletion anymore. Give up access to
-        // the HDTT map and unset the deletion flag.
-        HDTTMap.destroy();
-        Info.DelEntry = false;
-      }
-    }
-
-    // If we copied back to the host a struct/array containing pointers, we
-    // need to restore the original host pointer values from their shadow
-    // copies. If the struct is going to be deallocated, remove any remaining
-    // shadow pointer entries for this struct.
-    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
-      // If we copied the struct to the host, we need to restore the pointer.
-      if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) {
-        void **ShadowHstPtrAddr = (void **)Itr->first;
-        *ShadowHstPtrAddr = Itr->second.HstPtrVal;
-        DP("Restoring original host pointer value " DPxMOD " for host "
-           "pointer " DPxMOD "\n",
-           DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
-      }
-      // If the struct is to be deallocated, remove the shadow entry.
-      if (Info.DelEntry) {
-        DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first));
-        auto OldItr = Itr;
-        Itr++;
-        Device.ShadowPtrMap.erase(OldItr);
-      } else {
-        ++Itr;
-      }
-      return OFFLOAD_SUCCESS;
-    };
-    applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize,
-                            Info.TPR);
+  // Add post-processing functions
+  AsyncInfo.addPostProcessingFunction(
+      [=, Device = &Device,
+       PostProcessingPtrs = std::move(PostProcessingPtrs)]() mutable -> int {
+        int Ret = OFFLOAD_SUCCESS;
+
+        // Deallocate target pointer
+        for (PostProcessingInfo &Info : PostProcessingPtrs) {
+          // If we marked the entry to be deleted we need to verify no other
+          // thread reused it by now. If deletion is still supposed to happen by
+          // this thread LR will be set and exclusive access to the HDTT map
+          // will avoid another thread reusing the entry now. Note that we do
+          // not request (exclusive) access to the HDTT map if Info.DelEntry is
+          // not set.
+          LookupResult LR;
+          DeviceTy::HDTTMapAccessorTy HDTTMap =
+              Device->HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry);
+
+          if (Info.DelEntry) {
+            LR =
+                Device->lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize);
+            if (LR.Entry->getTotalRefCount() != 0 ||
+                LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) {
+              // The thread is not in charge of deletion anymore. Give up access
+              // to the HDTT map and unset the deletion flag.
+              HDTTMap.destroy();
+              Info.DelEntry = false;
+            }
+          }
-    // If we are deleting the entry the DataMapMtx is locked and we own the
-    // entry.
-    if (Info.DelEntry) {
-      if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin)
-        Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize);
+          // If we copied back to the host a struct/array containing pointers,
+          // we need to restore the original host pointer values from their
+          // shadow copies. If the struct is going to be deallocated, remove any
+          // remaining shadow pointer entries for this struct.
+          auto CB = [&](ShadowPtrListTy::iterator &Itr) {
+            // If we copied the struct to the host, we need to restore the
+            // pointer.
+            if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) {
+              void **ShadowHstPtrAddr = (void **)Itr->first;
+              *ShadowHstPtrAddr = Itr->second.HstPtrVal;
+              DP("Restoring original host pointer value " DPxMOD " for host "
+                 "pointer " DPxMOD "\n",
+                 DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
+            }
+            // If the struct is to be deallocated, remove the shadow entry.
+            if (Info.DelEntry) {
+              DP("Removing shadow pointer " DPxMOD "\n",
+                 DPxPTR((void **)Itr->first));
+              auto OldItr = Itr;
+              Itr++;
+              Device->ShadowPtrMap.erase(OldItr);
+            } else {
+              ++Itr;
+            }
+            return OFFLOAD_SUCCESS;
+          };
+          applyToShadowMapEntries(*Device, CB, Info.HstPtrBegin, Info.DataSize,
+                                  Info.TPR);
+
+          // If we are deleting the entry the DataMapMtx is locked and we own
+          // the entry.
+          if (Info.DelEntry) {
+            if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin)
+              Ret = Device->deallocTgtPtr(HDTTMap, LR, Info.DataSize);
+
+            if (Ret != OFFLOAD_SUCCESS) {
+              REPORT("Deallocating data from device failed.\n");
+              break;
+            }
+          }
+        }
-      if (Ret != OFFLOAD_SUCCESS) {
-        REPORT("Deallocating data from device failed.\n");
-        break;
-      }
-    }
-  }
+        return Ret;
+      });
 
   return Ret;
 }
@@ -947,20 +972,22 @@
       return OFFLOAD_FAIL;
     }
 
-    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
-      void **ShadowHstPtrAddr = (void **)Itr->first;
-      // Wait for device-to-host memcopies for whole struct to complete,
-      // before restoring the correct host pointer.
-      if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
-        return OFFLOAD_FAIL;
-      *ShadowHstPtrAddr = Itr->second.HstPtrVal;
-      DP("Restoring original host pointer value " DPxMOD
-         " for host pointer " DPxMOD "\n",
-         DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
-      ++Itr;
+    // Wait for device-to-host memcopies for whole struct to complete,
+    // before restoring the correct host pointer.
+    AsyncInfo.addPostProcessingFunction([=, Device = &Device]() -> int {
+      auto CB = [&](ShadowPtrListTy::iterator &Itr) {
+        void **ShadowHstPtrAddr = (void **)Itr->first;
+        *ShadowHstPtrAddr = Itr->second.HstPtrVal;
+        DP("Restoring original host pointer value " DPxMOD
+           " for host pointer " DPxMOD "\n",
+           DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
+        ++Itr;
+        return OFFLOAD_SUCCESS;
+      };
+      applyToShadowMapEntries(*Device, CB, HstPtrBegin, ArgSize, TPR);
+      return OFFLOAD_SUCCESS;
-    };
-    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
+    });
   }
 
   if (ArgType & OMP_TGT_MAPTYPE_TO) {
@@ -1157,19 +1184,19 @@
 /// first-private arguments and transfer them all at once.
 struct FirstPrivateArgInfoTy {
   /// The index of the element in \p TgtArgs corresponding to the argument
-  const int Index;
+  int Index;
   /// Host pointer begin
-  const char *HstPtrBegin;
+  char *HstPtrBegin;
   /// Host pointer end
-  const char *HstPtrEnd;
+  char *HstPtrEnd;
   /// Aligned size
-  const int64_t AlignedSize;
+  int64_t AlignedSize;
   /// Host pointer name
-  const map_var_info_t HstPtrName = nullptr;
+  map_var_info_t HstPtrName = nullptr;
 
-  FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
+  FirstPrivateArgInfoTy(int Index, void *HstPtr, int64_t Size,
                         const map_var_info_t HstPtrName = nullptr)
-      : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
+      : Index(Index), HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
         HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
         HstPtrName(HstPtrName) {}
 };
@@ -1471,12 +1498,17 @@
     return OFFLOAD_FAIL;
   }
 
-  // Free target memory for private arguments
-  Ret = PrivateArgumentManager.free();
-  if (Ret != OFFLOAD_SUCCESS) {
-    REPORT("Failed to deallocate target memory for private args\n");
-    return OFFLOAD_FAIL;
-  }
+  // Free target memory for private arguments after synchronization.
+  AsyncInfo.addPostProcessingFunction(
+      [PrivateArgumentManager =
+           std::move(PrivateArgumentManager)]() mutable -> int {
+        int Ret = PrivateArgumentManager.free();
+        if (Ret != OFFLOAD_SUCCESS) {
+          REPORT("Failed to deallocate target memory for private args\n");
+          return OFFLOAD_FAIL;
+        }
+        return Ret;
+      });
 
   return OFFLOAD_SUCCESS;
 }
@@ -1530,7 +1562,7 @@
 
   PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
 
-  int Ret;
+  int Ret = OFFLOAD_SUCCESS;
   if (ArgNum) {
     // Process data, such as data mapping, before launching the kernel
     Ret = processDataBefore(Loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -115,6 +115,9 @@
                                   kmp_depend_info_t *dep_list,
                                   kmp_int32 ndeps_noalias,
                                   kmp_depend_info_t *noalias_dep_list) __attribute__((weak));
+void *__kmpc_omp_get_target_async_handle(kmp_int32 gtid) __attribute__((weak));
+void __kmpc_omp_set_target_async_handle(kmp_int32 gtid, void *handle)
+    __attribute__((weak));
 #ifdef __cplusplus
 }
 #endif
@@ -187,6 +190,27 @@
   }
 }
 
+static inline AsyncInfoTy *acquireTaskAsyncInfo(int GTID, DeviceTy &Device,
+                                                bool &IsNew) {
+  auto *AsyncInfo = (AsyncInfoTy *)__kmpc_omp_get_target_async_handle(GTID);
+  IsNew = false;
+
+  if (!AsyncInfo) {
+    AsyncInfo = new AsyncInfoTy(Device);
+    __kmpc_omp_set_target_async_handle(GTID, (void *)AsyncInfo);
+    IsNew = true;
+  }
+
+  return AsyncInfo;
+}
+
+static inline void completeTaskAsyncInfo(int GTID, AsyncInfoTy *AsyncInfo) {
+  if (AsyncInfo->isDone()) {
+    delete AsyncInfo;
+    __kmpc_omp_set_target_async_handle(GTID, NULL);
+  }
+}
+
 #ifdef OMPTARGET_PROFILE_ENABLED
 #include "llvm/Support/TimeProfiler.h"
 #define TIMESCOPE() llvm::TimeTraceScope TimeScope(__FUNCTION__)
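
The control flow the patch expects the `_nowait` entry points to follow (register post-processing, dispatch once, poll with a non-blocking synchronize, run post-processing only once the queue has drained, and free the handle only when it reports done) can be summarized outside of libomptarget. The standalone sketch below is a minimal model under stated assumptions: `MockAsyncInfo`, `PendingOps`, and the `main` driver are hypothetical stand-ins for `AsyncInfoTy`, the plugin queue, and the task re-entering the nowait entry point; only the register/poll/post-process/destroy sequence mirrors the patch, not the real API.

// Standalone illustration of the post-processing / non-blocking synchronize
// pattern introduced above. MockAsyncInfo is a hypothetical stand-in for
// AsyncInfoTy; it is not libomptarget code.
#include <cstdio>
#include <functional>
#include <vector>

constexpr int OFFLOAD_SUCCESS = 0;

struct MockAsyncInfo {
  enum class SyncType { BLOCKING, NON_BLOCKING };

  int PendingOps = 3; // stands in for the plugin-side queue
  std::vector<std::function<int()>> PostProcessingFunctions;

  template <typename FuncTy> void addPostProcessingFunction(FuncTy Fn) {
    PostProcessingFunctions.emplace_back(Fn);
  }

  bool isDone() const { return PendingOps == 0; }

  int synchronize(SyncType Type = SyncType::BLOCKING) {
    if (Type == SyncType::BLOCKING)
      PendingOps = 0; // wait for all outstanding work
    else if (PendingOps > 0)
      --PendingOps; // query/advance without blocking

    // Post-processing (deferred deallocation, shadow-pointer restoration)
    // only runs once the queue has drained, as in AsyncInfoTy::synchronize.
    int Result = OFFLOAD_SUCCESS;
    if (isDone()) {
      for (auto &Fn : PostProcessingFunctions) {
        Result = Fn();
        if (Result != OFFLOAD_SUCCESS)
          break;
      }
      PostProcessingFunctions.clear();
    }
    return Result;
  }
};

int main() {
  MockAsyncInfo AsyncInfo;
  AsyncInfo.addPostProcessingFunction([]() -> int {
    std::puts("post-processing: release buffers");
    return OFFLOAD_SUCCESS;
  });

  // A task repeatedly reaches the nowait entry point: poll non-blocking,
  // keep the handle alive while work is outstanding, drop it once done.
  while (!AsyncInfo.isDone())
    (void)AsyncInfo.synchronize(MockAsyncInfo::SyncType::NON_BLOCKING);
  return 0;
}

The sketch also shows why the deallocation and shadow-pointer work above is captured by value into the registered callbacks: the enclosing targetDataEnd/processDataAfter frame may have returned long before the non-blocking synchronize finally observes an empty queue.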