Index: openmp/docs/optimizations/OpenMPOpt.rst =================================================================== --- openmp/docs/optimizations/OpenMPOpt.rst +++ openmp/docs/optimizations/OpenMPOpt.rst @@ -100,6 +100,58 @@ inform the user if any globalization calls remain if remarks are enabled. This should be treated as a defect in the program. +.. _Others: + +Others +========= + +.. contents:: + :local: + :depth: 1 + +Automatic Asynchronous Execution of Target Regions +-------------------------------------------------- + +Offloaded regions can be executed synchronously or asynchronously +depending on the pragmas instruction (e.g. nowait) and implementation. +The environment flag `LIBOMPTARGET_INTRA_THREAD_ASYNC=1` allows asynchronous +execution of OpenMP Target Regions on Nvidia GPUs by synchronizing +only on memory transfers or at the end of the program when the +OpenMP runtime is torn down. + +.. code-block:: c++ + LIBOMPTARGET_ASYNC=1 ./simple1 //For async execution + +For example: +.. code-block:: c++ + #pragma omp target enter data map(alloc: array[:N]) + #pragma omp target map(tofrom: array[:N]) + busy_device(array); //TT1 + #pragma omp target map(tofrom: array[:N]) + busy_device(array); //TT2 + busy_host(); + #pragma omp target exit data map(from: array[:N]) + +In the example above, the target regions inside the enter/exit data region +do not perform any memory transfer, since the mapped memory is already +on the device. So, the host launches the execution of both target +regions (TT1 and TT2) in the device queue and continues the execution +on its end (busy host). Offloaded regions that trigger memory transfers +will not benefit from this optimization. + + +Limitations: +- Cross-device synchronization primitives or atomics are not allowed. +- Timers wrapping target regions will not show the time it takes to + execute the target task but the time the runtime takes to launch + the execution of the task. +- Synchronization using different threads: Target task and + synchronizations in different threads is not supported yet + when the flag for asynchronous asynchronous execution + is enabled. +- RTL of the target device should allow asynchronous execution and + provide a queue mechanism to execute kernels sequentially. + Resources ========= Index: openmp/libomptarget/include/device.h =================================================================== --- openmp/libomptarget/include/device.h +++ openmp/libomptarget/include/device.h @@ -306,6 +306,8 @@ typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> PendingCtorsDtorsPerLibrary; +class AsyncInfoMng; + struct DeviceTy { int32_t DeviceID; RTLInfoTy *RTL; @@ -334,6 +336,10 @@ std::mutex PendingGlobalsMtx, ShadowMtx; + /// Manages AsyncInfo associated to the device and thread + /// depending on the value of the env variable LIBOMPTARGET_INTRA_THREAD_ASYNC + AsyncInfoMng AIM; + DeviceTy(RTLInfoTy *RTL); // DeviceTy is not copyable DeviceTy(const DeviceTy &D) = delete; @@ -384,7 +390,8 @@ /// not. It is the caller's responsibility to skip calling this function if /// the map entry is not expected to exist because \p HstPtrBegin uses shared /// memory. - int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size); + int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size, + AsyncInfoTy &AsyncInfo); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -14,11 +14,15 @@ #ifndef _OMPTARGET_H_ #define _OMPTARGET_H_ +#include +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" + #include +#include #include #include - -#include +#include #define OFFLOAD_SUCCESS (0) #define OFFLOAD_FAIL (~0) @@ -187,25 +191,70 @@ __tgt_async_info AsyncInfo; DeviceTy &Device; + /// Enables/Disable synchronization in destructor + const bool ShouldSyncWhenDestroyed; public: - AsyncInfoTy(DeviceTy &Device) : Device(Device) {} - ~AsyncInfoTy() { synchronize(); } + AsyncInfoTy(DeviceTy &Device, bool ShouldSyncWhenDestroyed = true) + : Device(Device), ShouldSyncWhenDestroyed(ShouldSyncWhenDestroyed) {} + ~AsyncInfoTy() { + if (ShouldSyncWhenDestroyed) + synchronize(true); + } /// Implicit conversion to the __tgt_async_info which is used in the /// plugin interface. operator __tgt_async_info *() { return &AsyncInfo; } - /// Synchronize all pending actions. - /// + /// Synchronize all pending actions when the LIBOMPTARGET_INTRA_THREAD_ASYNC + /// env var is disabled or when synchronization is forced (ForceSync = true) + /// Otherwise, synchronization is skipped /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately. - int synchronize(); + int synchronize(bool ForceSync = false); /// Return a void* reference with a lifetime that is at least as long as this /// AsyncInfoTy object. The location can be used as intermediate buffer. void *&getVoidPtrLocation(); }; +/// This structs allows for automatic asynchronous execution of target regions. +/// It associates an AsyncInfoTy object with a thread id to guarantee that +/// target tasks of the same thread are launched on the same queue. +class AsyncInfoMng { + llvm::DenseMap AsyncInfoM; + DeviceTy &Device; + std::mutex AsyncMtx; + +public: + AsyncInfoMng(DeviceTy &Device); + ~AsyncInfoMng(); + + /// Replaces AI with the associated AsyncInfo object when + /// LIBOMPTARGET_INTRA_THREAD_ASYNC is enabled. + AsyncInfoTy ®isterAI(AsyncInfoTy *AI); +}; + +/// Specialize DenseMapInfo for std::thread::id +template <> struct llvm::DenseMapInfo { + + static std::thread::id getEmptyKey() { + return std::thread::id(); + } + + static std::thread::id getTombstoneKey() { + return std::thread::id(); + } + + static unsigned getHashValue(const std::thread::id &Val) { + static std::hash hasher; + return hasher(Val); + } + + static bool isEqual(std::thread::id LHS, std::thread::id RHS) { + return LHS == RHS; + } +}; + /// This struct is a record of non-contiguous information struct __tgt_target_non_contig { uint64_t Offset; Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -52,7 +52,7 @@ DeviceTy::DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(), - PendingGlobalsMtx(), ShadowMtx() {} + PendingGlobalsMtx(), ShadowMtx(), AIM(*this) {} DeviceTy::~DeviceTy() { if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) @@ -438,7 +438,7 @@ } int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, - int64_t Size) { + int64_t Size, AsyncInfoTy &AsyncInfo) { // Check if the pointer is contained in any sub-nodes. if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter)) { @@ -453,7 +453,11 @@ assert(HT.getTotalRefCount() == 0 && HT.getDeleteThreadId() == std::this_thread::get_id() && "Trying to delete entry that is in use or owned by another thread."); - + // We need to force synchronization before deallocating data. + int Ret = AsyncInfo.synchronize(true); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + // Delete tgt data DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", DPxPTR(HT.TgtPtrBegin), Size); deleteData((void *)HT.TgtPtrBegin); @@ -466,7 +470,7 @@ HDTTMap->erase(LR.Entry); delete LR.Entry; - int Ret = OFFLOAD_SUCCESS; + Ret = OFFLOAD_SUCCESS; if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) { REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event)); Ret = OFFLOAD_FAIL; Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -93,10 +93,11 @@ #endif AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AI = Device.AIM.registerAI(&AsyncInfo); int Rc = targetDataBegin(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, AsyncInfo); + ArgTypes, ArgNames, ArgMappers, AI); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AI.synchronize(true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -142,10 +143,11 @@ #endif AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AI = Device.AIM.registerAI(&AsyncInfo); int Rc = targetDataEnd(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, AsyncInfo); + ArgTypes, ArgNames, ArgMappers, AI); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AI.synchronize(true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -179,10 +181,11 @@ DeviceTy &Device = *PM->Devices[DeviceId]; AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AI = Device.AIM.registerAI(&AsyncInfo); int Rc = targetDataUpdate(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, AsyncInfo); + ArgTypes, ArgNames, ArgMappers, AI); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AI.synchronize(true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -244,12 +247,16 @@ DeviceTy &Device = *PM->Devices[DeviceId]; AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AI = Device.AIM.registerAI(&AsyncInfo); int Rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs, Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames, Args->ArgMappers, NumTeams, ThreadLimit, Args->Tripcount, - IsTeams, AsyncInfo); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + IsTeams, AI); + if (Rc == OFFLOAD_SUCCESS) { + /// Synchronization is not forced when the env var is enabled and the target + /// region does not have any memory transfer + Rc = AI.synchronize(); + } handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); return OMP_TGT_SUCCESS; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -18,18 +18,24 @@ #include #include +#include #include using llvm::SmallVector; -int AsyncInfoTy::synchronize() { +bool AsyncFlag = false; + +int AsyncInfoTy::synchronize(bool ForceSync) { int Result = OFFLOAD_SUCCESS; - if (AsyncInfo.Queue) { - // If we have a queue we need to synchronize it now. - Result = Device.synchronize(*this); - assert(AsyncInfo.Queue == nullptr && - "The device plugin should have nulled the queue to indicate there " - "are no outstanding actions!"); + if (!AsyncFlag || ForceSync) { + if (AsyncInfo.Queue) { + DP("Device %d synchronization\n", Device.DeviceID); + // If we have a queue we need to synchronize it now. + Result = Device.synchronize(*this); + assert(AsyncInfo.Queue == nullptr && + "The device plugin should have nulled the queue to indicate there " + "are no outstanding actions!"); + } } return Result; } @@ -39,6 +45,31 @@ return BufferLocations.back(); } +/// Async info manager +AsyncInfoMng::AsyncInfoMng(DeviceTy &Device) : Device(Device) {} + +AsyncInfoMng::~AsyncInfoMng() { + for (const auto &It : AsyncInfoM) + delete (It.second); + AsyncInfoM.clear(); +} + +AsyncInfoTy &AsyncInfoMng::registerAI(AsyncInfoTy *AI) { + if (!AsyncFlag) + return *AI; + + /// Get async info + std::lock_guard MapLock(AsyncMtx); + llvm::DenseMap::iterator it = + AsyncInfoM.find(std::this_thread::get_id()); + if (it != AsyncInfoM.end()) + return *(it->second); + /// Add new Async Info + AsyncInfoTy *AsyncInfo = new AsyncInfoTy(Device, false); + AsyncInfoM.insert(std::make_pair(std::this_thread::get_id(), AsyncInfo)); + return *AsyncInfo; +} + /* All begin addresses for partially mapped structs must be 8-aligned in order * to ensure proper alignment of members. E.g. * @@ -206,7 +237,7 @@ } } // All constructors have been issued, wait for them now. - if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + if (AsyncInfo.synchronize(true) != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; } Device.HasPendingGlobals = false; @@ -680,7 +711,7 @@ void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { - int Ret; + int Ret = OFFLOAD_SUCCESS; SmallVector PostProcessingPtrs; void *FromMapperBase = nullptr; // process each input. @@ -839,14 +870,6 @@ } } - // TODO: We should not synchronize here but pass the AsyncInfo object to the - // allocate/deallocate device APIs. - // - // We need to synchronize before deallocating data. - Ret = AsyncInfo.synchronize(); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - // Deallocate target pointer for (PostProcessingInfo &Info : PostProcessingPtrs) { // If we marked the entry to be deleted we need to verify no other thread @@ -857,7 +880,6 @@ LookupResult LR; DeviceTy::HDTTMapAccessorTy HDTTMap = Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry); - if (Info.DelEntry) { LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize); if (LR.Entry->getTotalRefCount() != 0 || @@ -895,12 +917,11 @@ }; applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize, Info.TPR); - // If we are deleting the entry the DataMapMtx is locked and we own the // entry. if (Info.DelEntry) { if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin) - Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize); + Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Deallocating data from device failed.\n"); @@ -951,7 +972,7 @@ void **ShadowHstPtrAddr = (void **)Itr->first; // Wait for device-to-host memcopies for whole struct to complete, // before restoring the correct host pointer. - if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + if (AsyncInfo.synchronize(true) != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; *ShadowHstPtrAddr = Itr->second.HstPtrVal; DP("Restoring original host pointer value " DPxMOD Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -39,6 +39,7 @@ PluginManager *PM; static char *ProfileTraceFile = nullptr; +extern bool AsyncFlag; __attribute__((constructor(101))) void init() { DP("Init target library!\n"); @@ -61,6 +62,11 @@ // TODO: add a configuration option for time granularity if (ProfileTraceFile) timeTraceProfilerInitialize(500 /* us */, "libomptarget"); + + /// Asynchronous execution flag + if (char *EnvStr = getenv("LIBOMPTARGET_INTRA_THREAD_ASYNC")) + AsyncFlag = std::stoi(EnvStr) ? true : false; + DP("Asynchronous execution %s\n", AsyncFlag ? "Enabled" : "Disabled"); } __attribute__((destructor(101))) void deinit() { @@ -535,7 +541,7 @@ // Remove this library's entry from PendingCtorsDtors Device.PendingCtorsDtors.erase(Desc); // All constructors have been issued, wait for them now. - if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + if (AsyncInfo.synchronize(true) != OFFLOAD_SUCCESS) DP("Failed synchronizing destructors kernels.\n"); } Device.PendingGlobalsMtx.unlock();