Index: openmp/docs/optimizations/OpenMPOpt.rst =================================================================== --- openmp/docs/optimizations/OpenMPOpt.rst +++ openmp/docs/optimizations/OpenMPOpt.rst @@ -100,6 +100,58 @@ inform the user if any globalization calls remain if remarks are enabled. This should be treated as a defect in the program. +.. _Others: + +Others +========= + +.. contents:: + :local: + :depth: 1 + +Automatic Asynchronous Execution of Target Regions +-------------------------------------------------- + +Offloaded regions can be executed synchronously or asynchronously +depending on the pragmas instruction (e.g. nowait) and implementation. +The environment flag `LIBOMPTARGET_ASYNC=1` allows asynchronous +execution of OpenMP Target Regions on Nvidia GPUs by synchronizing +only on memory transfers or at the end of the program when the +OpenMP runtime is torn down. + +.. code-block:: c++ + LIBOMPTARGET_ASYNC=1 ./simple1 //For async execution + +For example: +.. code-block:: c++ + #pragma omp target enter data map(alloc: array[:N]) + #pragma omp target map(tofrom: array[:N]) + busy_device(array); //TT1 + #pragma omp target map(tofrom: array[:N]) + busy_device(array); //TT2 + busy_host(); + #pragma omp target exit data map(from: array[:N]) + +In the example above, the target regions inside the enter/exit data region +do not perform any memory transfer, since the mapped memory is already +on the device. So, the host launches the execution of both target +regions (TT1 and TT2) in the device queue and continues the execution +on its end (busy host). Offloaded regions that trigger memory transfers +will not benefit from this optimization. + + +Limitations: +- Cross-device synchronization primitives or atomics are not allowed. +- Timers wrapping target regions will not show the time it takes to + execute the target task but the time the runtime takes to launch + the execution of the task. +- Synchronization using different threads: Target task and + synchronizations in different threads is not supported yet + when the flag for asynchronous asynchronous execution + is enabled. +- RTL of the target device should allow asynchronous execution and + provide a queue mechanism to execute kernels sequentially. + Resources ========= Index: openmp/libomptarget/include/device.h =================================================================== --- openmp/libomptarget/include/device.h +++ openmp/libomptarget/include/device.h @@ -306,6 +306,8 @@ typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> PendingCtorsDtorsPerLibrary; +struct AsyncInfoMng; + struct DeviceTy { int32_t DeviceID; RTLInfoTy *RTL; @@ -334,6 +336,10 @@ std::mutex PendingGlobalsMtx, ShadowMtx; + /// Manages AsyncInfo associated to the device and thread + /// depending on the value of the env variable LIBOMPTARGET_ASYNC + AsyncInfoMng AIM; + DeviceTy(RTLInfoTy *RTL); // DeviceTy is not copyable DeviceTy(const DeviceTy &D) = delete; @@ -384,7 +390,8 @@ /// not. It is the caller's responsibility to skip calling this function if /// the map entry is not expected to exist because \p HstPtrBegin uses shared /// memory. - int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size); + int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size, + AsyncInfoTy &AsyncInfo); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -14,11 +14,13 @@ #ifndef _OMPTARGET_H_ #define _OMPTARGET_H_ +#include #include +#include +#include #include #include - -#include +#include #define OFFLOAD_SUCCESS (0) #define OFFLOAD_FAIL (~0) @@ -187,25 +189,51 @@ __tgt_async_info AsyncInfo; DeviceTy &Device; + /// Enables/Disable synchronization in destructor + bool ShouldSyncWhenDestroyed; public: - AsyncInfoTy(DeviceTy &Device) : Device(Device) {} - ~AsyncInfoTy() { synchronize(); } + AsyncInfoTy(DeviceTy &Device, bool ShouldSyncWhenDestroyed = true) + : Device(Device), ShouldSyncWhenDestroyed(ShouldSyncWhenDestroyed) {} + ~AsyncInfoTy() { + if (ShouldSyncWhenDestroyed) + synchronize(); + } /// Implicit conversion to the __tgt_async_info which is used in the /// plugin interface. operator __tgt_async_info *() { return &AsyncInfo; } - /// Synchronize all pending actions. - /// + /// Synchronize all pending actions when the LIBOMPTARGET_ASYNC env var + /// is disabled or when synchronization is forced (ForceSync = true) + /// Otherwise, synchronization is skipped /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately. - int synchronize(); + int synchronize(bool ForceSync = false); /// Return a void* reference with a lifetime that is at least as long as this /// AsyncInfoTy object. The location can be used as intermediate buffer. void *&getVoidPtrLocation(); }; +/// This structs allows for automatic asynchronous execution of target regions. +struct AsyncInfoMng { + std::map AsyncInfoM; + DeviceTy *Device; + std::mutex AsyncMtx; + + AsyncInfoMng(DeviceTy *Device); + ~AsyncInfoMng(); + + /// Gets the associated AsyncInfo object based on the value of the + /// LIBOMPTARGET_ASYNC env var. When LIBOMPTARGET_ASYNC is enabled it returns + /// an AsyncInfo object associated to the thread id, otherwise, it returns + /// a new object. + AsyncInfoTy &get(); + + /// Frees the input AsyncInfo object when LIBOMPTARGET_ASYNC is disabled + void free(AsyncInfoTy &AsyncInfo); +}; + /// This struct is a record of non-contiguous information struct __tgt_target_non_contig { uint64_t Offset; Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -52,7 +52,7 @@ DeviceTy::DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(), - PendingGlobalsMtx(), ShadowMtx() {} + PendingGlobalsMtx(), ShadowMtx(), AIM(this) {} DeviceTy::~DeviceTy() { if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)) @@ -438,7 +438,7 @@ } int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, - int64_t Size) { + int64_t Size, AsyncInfoTy &AsyncInfo) { // Check if the pointer is contained in any sub-nodes. if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter)) { @@ -453,7 +453,11 @@ assert(HT.getTotalRefCount() == 0 && HT.getDeleteThreadId() == std::this_thread::get_id() && "Trying to delete entry that is in use or owned by another thread."); - + // Synchronization + int Ret = AsyncInfo.synchronize(); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + // Delete tgt data DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", DPxPTR(HT.TgtPtrBegin), Size); deleteData((void *)HT.TgtPtrBegin); @@ -466,7 +470,7 @@ HDTTMap->erase(LR.Entry); delete LR.Entry; - int Ret = OFFLOAD_SUCCESS; + Ret = OFFLOAD_SUCCESS; if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) { REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event)); Ret = OFFLOAD_FAIL; Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -92,11 +92,12 @@ } #endif - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = Device.AIM.get(); int Rc = targetDataBegin(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AsyncInfo.synchronize(true); + Device.AIM.free(AsyncInfo); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -141,11 +142,12 @@ } #endif - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = Device.AIM.get(); int Rc = targetDataEnd(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AsyncInfo.synchronize(true); + Device.AIM.free(AsyncInfo); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -178,11 +180,12 @@ "Updating OpenMP data"); DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = Device.AIM.get(); int Rc = targetDataUpdate(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AsyncInfo.synchronize(true); + Device.AIM.free(AsyncInfo); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -243,13 +246,17 @@ NumTeams = 0; DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = Device.AIM.get(); int Rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs, Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames, Args->ArgMappers, NumTeams, ThreadLimit, Args->Tripcount, IsTeams, AsyncInfo); - if (Rc == OFFLOAD_SUCCESS) + if (Rc == OFFLOAD_SUCCESS) { + /// Synchronization is not forced when the env var is enabled and the target + /// region does not have any memory transfer Rc = AsyncInfo.synchronize(); + } + Device.AIM.free(AsyncInfo); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); return OMP_TGT_SUCCESS; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -18,18 +18,24 @@ #include #include +#include #include using llvm::SmallVector; -int AsyncInfoTy::synchronize() { +bool AsyncFlag = false; + +int AsyncInfoTy::synchronize(bool ForceSync) { int Result = OFFLOAD_SUCCESS; - if (AsyncInfo.Queue) { - // If we have a queue we need to synchronize it now. - Result = Device.synchronize(*this); - assert(AsyncInfo.Queue == nullptr && - "The device plugin should have nulled the queue to indicate there " - "are no outstanding actions!"); + if (!AsyncFlag || ForceSync) { + if (AsyncInfo.Queue) { + DP("Device %d synchronization\n", Device.DeviceID); + // If we have a queue we need to synchronize it now. + Result = Device.synchronize(*this); + assert(AsyncInfo.Queue == nullptr && + "The device plugin should have nulled the queue to indicate there " + "are no outstanding actions!"); + } } return Result; } @@ -39,6 +45,38 @@ return BufferLocations.back(); } +/// Async info manager +AsyncInfoMng::AsyncInfoMng(DeviceTy *Device) : Device(Device) { } + +AsyncInfoMng::~AsyncInfoMng() { + for (const auto &It : AsyncInfoM) + delete(It.second); + AsyncInfoM.clear(); +} + +AsyncInfoTy &AsyncInfoMng::get() { + if (!AsyncFlag) + return *(new AsyncInfoTy(*Device)); + + /// Get async info + std::lock_guard MapLock(AsyncMtx); + std::map::iterator it = + AsyncInfoM.find(std::this_thread::get_id()); + if (it != AsyncInfoM.end()) + return *(it->second); + + /// Add new Async Info + AsyncInfoTy *AsyncInfo = new AsyncInfoTy(*Device, false); + AsyncInfoM.insert(it, std::make_pair(std::this_thread::get_id(), AsyncInfo)); + return *AsyncInfo; +} + +void AsyncInfoMng::free(AsyncInfoTy &AsyncInfo) { + if (!AsyncFlag) + delete &AsyncInfo; +} + + /* All begin addresses for partially mapped structs must be 8-aligned in order * to ensure proper alignment of members. E.g. * @@ -680,7 +718,7 @@ void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { - int Ret; + int Ret = OFFLOAD_SUCCESS; SmallVector PostProcessingPtrs; void *FromMapperBase = nullptr; // process each input. @@ -839,14 +877,6 @@ } } - // TODO: We should not synchronize here but pass the AsyncInfo object to the - // allocate/deallocate device APIs. - // - // We need to synchronize before deallocating data. - Ret = AsyncInfo.synchronize(); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - // Deallocate target pointer for (PostProcessingInfo &Info : PostProcessingPtrs) { // If we marked the entry to be deleted we need to verify no other thread @@ -857,7 +887,6 @@ LookupResult LR; DeviceTy::HDTTMapAccessorTy HDTTMap = Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry); - if (Info.DelEntry) { LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize); if (LR.Entry->getTotalRefCount() != 0 || @@ -895,12 +924,11 @@ }; applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize, Info.TPR); - // If we are deleting the entry the DataMapMtx is locked and we own the // entry. if (Info.DelEntry) { if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin) - Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize); + Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Deallocating data from device failed.\n"); Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -39,6 +39,7 @@ PluginManager *PM; static char *ProfileTraceFile = nullptr; +extern bool AsyncFlag; __attribute__((constructor(101))) void init() { DP("Init target library!\n"); @@ -61,6 +62,11 @@ // TODO: add a configuration option for time granularity if (ProfileTraceFile) timeTraceProfilerInitialize(500 /* us */, "libomptarget"); + + /// Asynchronous execution flag + if (char *EnvStr = getenv("LIBOMPTARGET_ASYNC")) + AsyncFlag = std::stoi(EnvStr) ? true : false; + DP("Asynchronous execution %s\n", AsyncFlag ? "Enabled" : "Disabled"); } __attribute__((destructor(101))) void deinit() {