Index: openmp/docs/optimizations/OpenMPOpt.rst =================================================================== --- openmp/docs/optimizations/OpenMPOpt.rst +++ openmp/docs/optimizations/OpenMPOpt.rst @@ -100,6 +100,58 @@ inform the user if any globalization calls remain if remarks are enabled. This should be treated as a defect in the program. +.. _Others: + +Others +========= + +.. contents:: + :local: + :depth: 1 + +Automatic Asynchronous Execution of Target Regions +-------------------------------------------------- + +Offloaded regions can be executed synchronously or asynchronously +depending on the pragmas instruction (e.g. nowait) and implementation. +The environment flag `LIBOMPTARGET_ASYNC=1` allows asynchronous +execution of OpenMP Target Regions on Nvidia GPUs by synchronizing +only on memory transfers or at the end of the program when the +OpenMP runtime is torn down. + +.. code-block:: c++ + LIBOMPTARGET_ASYNC=1 ./simple1 //For async execution + +For example: +.. code-block:: c++ + #pragma omp target enter data map(alloc: array[:N]) + #pragma omp target map(tofrom: array[:N]) + busy_device(array); //TT1 + #pragma omp target map(tofrom: array[:N]) + busy_device(array); //TT2 + busy_host(); + #pragma omp target exit data map(from: array[:N]) + +In the example above, the target regions inside the enter/exit data region +do not perform any memory transfer, since the mapped memory is already +on the device. So, the host launches the execution of both target +regions (TT1 and TT2) in the device queue and continues the execution +on its end (busy host). Offloaded regions that trigger memory transfers +will not benefit from this optimization. + + +Limitations: +- Cross-device synchronization primitives or atomics are not allowed. +- Timers wrapping target regions will not show the time it takes to + execute the target task but the time the runtime takes to launch + the execution of the task. +- Synchronization using different threads: Target task and + synchronizations in different threads is not supported yet + when the flag for asynchronous asynchronous execution + is enabled. +- RTL of the target device should allow asynchronous execution and + provide a queue mechanism to execute kernels sequentially. + Resources ========= Index: openmp/libomptarget/include/device.h =================================================================== --- openmp/libomptarget/include/device.h +++ openmp/libomptarget/include/device.h @@ -306,6 +306,8 @@ typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> PendingCtorsDtorsPerLibrary; +struct AsyncInfoMng; + struct DeviceTy { int32_t DeviceID; RTLInfoTy *RTL; @@ -334,6 +336,9 @@ std::mutex PendingGlobalsMtx, ShadowMtx; + /// Controls whether to synchronize the device or not + AsyncInfoMng AIM; + DeviceTy(RTLInfoTy *RTL); // DeviceTy is not copyable DeviceTy(const DeviceTy &D) = delete; @@ -384,7 +389,8 @@ /// not. It is the caller's responsibility to skip calling this function if /// the map entry is not expected to exist because \p HstPtrBegin uses shared /// memory. - int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size); + int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size, + AsyncInfoTy &AsyncInfo); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -14,11 +14,13 @@ #ifndef _OMPTARGET_H_ #define _OMPTARGET_H_ +#include #include +#include +#include #include #include - -#include +#include #define OFFLOAD_SUCCESS (0) #define OFFLOAD_FAIL (~0) @@ -187,10 +189,16 @@ __tgt_async_info AsyncInfo; DeviceTy &Device; + /// Enables/Disable synchronization in destructor + bool ShouldSync; public: - AsyncInfoTy(DeviceTy &Device) : Device(Device) {} - ~AsyncInfoTy() { synchronize(); } + AsyncInfoTy(DeviceTy &Device, bool ShouldSync = true) + : Device(Device), ShouldSync(ShouldSync) {} + ~AsyncInfoTy() { + if (ShouldSync) + synchronize(); + } /// Implicit conversion to the __tgt_async_info which is used in the /// plugin interface. @@ -206,6 +214,29 @@ void *&getVoidPtrLocation(); }; +/// This structs allows for automatic asynchronous execution of target regions. +/// It controls whether to synchronize or not based on the value of AsyncFlag. +struct AsyncInfoMng { + std::map AsyncInfoM; + DeviceTy *Device; + bool AsyncFlag; + std::mutex AsyncMtx; + + AsyncInfoMng(DeviceTy *Device); + ~AsyncInfoMng(); + + /// Get async info object + AsyncInfoTy *get(); + + /// It synchronizes the AsyncInfo object only when the AsyncFlag + /// is disabled or when synchronization is forced (ForceSync = true). + /// Otherwise, synchronization is skipped + int synchronize(AsyncInfoTy &AsyncFlag, bool ForceSync); + + /// Free AsyncInfo + void free(); +}; + /// This struct is a record of non-contiguous information struct __tgt_target_non_contig { uint64_t Offset; Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -52,7 +52,7 @@ DeviceTy::DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(), - PendingGlobalsMtx(), ShadowMtx() {} + PendingGlobalsMtx(), ShadowMtx(), AIM(this) {} DeviceTy::~DeviceTy() { if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) @@ -438,7 +438,7 @@ } int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, - int64_t Size) { + int64_t Size, AsyncInfoTy &AsyncInfo) { // Check if the pointer is contained in any sub-nodes. if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter)) { @@ -453,7 +453,11 @@ assert(HT.getTotalRefCount() == 0 && HT.getDeleteThreadId() == std::this_thread::get_id() && "Trying to delete entry that is in use or owned by another thread."); - + // Synchronization + int Ret = AsyncInfo.synchronize(); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + // Delete tgt data DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", DPxPTR(HT.TgtPtrBegin), Size); deleteData((void *)HT.TgtPtrBegin); @@ -466,7 +470,7 @@ HDTTMap->erase(LR.Entry); delete LR.Entry; - int Ret = OFFLOAD_SUCCESS; + Ret = OFFLOAD_SUCCESS; if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) { REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event)); Ret = OFFLOAD_FAIL; Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -92,11 +92,11 @@ } #endif - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *Device.AIM.get(); int Rc = targetDataBegin(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = Device.AIM.synchronize(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -141,11 +141,11 @@ } #endif - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *Device.AIM.get(); int Rc = targetDataEnd(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = Device.AIM.synchronize(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -178,11 +178,11 @@ "Updating OpenMP data"); DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *Device.AIM.get(); int Rc = targetDataUpdate(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = Device.AIM.synchronize(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -243,13 +243,16 @@ NumTeams = 0; DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *Device.AIM.get(); int Rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs, Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames, Args->ArgMappers, NumTeams, ThreadLimit, Args->Tripcount, IsTeams, AsyncInfo); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + if (Rc == OFFLOAD_SUCCESS) { + /// Synchronization is not forced when the env var is enabled and the target + /// region does not have any memory transfer + Rc = Device.AIM.synchronize(AsyncInfo, false); + } handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); return OMP_TGT_SUCCESS; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -18,6 +18,7 @@ #include #include +#include #include using llvm::SmallVector; @@ -25,6 +26,7 @@ int AsyncInfoTy::synchronize() { int Result = OFFLOAD_SUCCESS; if (AsyncInfo.Queue) { + DP("Device %d synchronization\n", Device.DeviceID); // If we have a queue we need to synchronize it now. Result = Device.synchronize(*this); assert(AsyncInfo.Queue == nullptr && @@ -39,6 +41,61 @@ return BufferLocations.back(); } +/// Async info manager +AsyncInfoMng::AsyncInfoMng(DeviceTy *Device) { + if (char *EnvStr = getenv("LIBOMPTARGET_ASYNC")) + AsyncFlag = std::stoi(EnvStr) ? true : false; + else + AsyncFlag = false; + DP("Asynchronous execution %s\n", AsyncFlag ? "Enabled" : "Disabled"); + this->Device = Device; +} + +AsyncInfoMng::~AsyncInfoMng() { + std::map::iterator it; + for (it = AsyncInfoM.begin(); it != AsyncInfoM.end(); it++) + delete (it->second); + AsyncInfoM.clear(); +} + +AsyncInfoTy *AsyncInfoMng::get() { + if (!AsyncFlag) + return new AsyncInfoTy(*Device); + + /// Get async info + std::lock_guard MapLock(AsyncMtx); + std::map::iterator it = + AsyncInfoM.find(std::this_thread::get_id()); + if (it != AsyncInfoM.end()) + return (it->second); + + /// Add new Async Info + AsyncInfoTy *AsyncInfo = new AsyncInfoTy(*Device, false); + AsyncInfoM.insert(it, std::make_pair(std::this_thread::get_id(), AsyncInfo)); + return AsyncInfo; +} + +int AsyncInfoMng::synchronize(AsyncInfoTy &AsyncInfo, bool ForceSync) { + int Rc = OFFLOAD_SUCCESS; + if (!AsyncFlag) { + Rc = AsyncInfo.synchronize(); + delete &AsyncInfo; + } else if (ForceSync) { + Rc = AsyncInfo.synchronize(); + } + return Rc; +} + +void AsyncInfoMng::free() { + if (AsyncFlag) { + std::lock_guard MapLock(AsyncMtx); + std::map::iterator it = + AsyncInfoM.find(std::this_thread::get_id()); + delete it->second; + AsyncInfoM.erase(it); + } +} + /* All begin addresses for partially mapped structs must be 8-aligned in order * to ensure proper alignment of members. E.g. * @@ -680,7 +737,7 @@ void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { - int Ret; + int Ret = OFFLOAD_SUCCESS; SmallVector PostProcessingPtrs; void *FromMapperBase = nullptr; // process each input. @@ -839,14 +896,6 @@ } } - // TODO: We should not synchronize here but pass the AsyncInfo object to the - // allocate/deallocate device APIs. - // - // We need to synchronize before deallocating data. - Ret = AsyncInfo.synchronize(); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - // Deallocate target pointer for (PostProcessingInfo &Info : PostProcessingPtrs) { // If we marked the entry to be deleted we need to verify no other thread @@ -857,7 +906,6 @@ LookupResult LR; DeviceTy::HDTTMapAccessorTy HDTTMap = Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry); - if (Info.DelEntry) { LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize); if (LR.Entry->getTotalRefCount() != 0 || @@ -895,12 +943,11 @@ }; applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize, Info.TPR); - // If we are deleting the entry the DataMapMtx is locked and we own the // entry. if (Info.DelEntry) { if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin) - Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize); + Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Deallocating data from device failed.\n");