Index: openmp/docs/optimizations/OpenMPOpt.rst =================================================================== --- openmp/docs/optimizations/OpenMPOpt.rst +++ openmp/docs/optimizations/OpenMPOpt.rst @@ -100,6 +100,47 @@ inform the user if any globalization calls remain if remarks are enabled. This should be treated as a defect in the program. +.. _Others: + +Others +========= + +.. contents:: + :local: + :depth: 1 + +Automatic Asynchronous Execution of Target Regions +--------------------------------- + +By default, offloaded regions are executed synchronously, +thus the host thread blocks until their completion. +By using the enviroment flag `LIBOMPTARGET_ASYNC=1` the implicit +barrier that exists at the end of every target region is removed. + +.. code-block:: c++ + LIBOMPTARGET_ASYNC=1 ./simple1 //For async execution + +Limitations: +- It is necessary to define host and target data environments + (e.g. `#pragma omp target data map`). For example: + + .. code-block:: c++ + #pragma omp target enter data map(alloc: array[:N]) + #pragma omp target map(tofrom: array[:N]) + busy_device(array); + #pragma omp target map(tofrom: array[:N]) + busy_device(array); + busy_host(); + #pragma omp target exit data map(from: array[:N]) +- Cross-device synchronization primitives or atomics are not allowed. +- Timers wrapping target regions will not show the time it takes to + execute the target task but the time the runtime takes to launch + the execution of the task. +- Synchronization using different threads: Target task and + synchronizations in different threads is not supported yet + when the flag for asynchronous asynchronous execution + is enabled. + Resources ========= Index: openmp/libomptarget/include/device.h =================================================================== --- openmp/libomptarget/include/device.h +++ openmp/libomptarget/include/device.h @@ -306,6 +306,8 @@ typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> PendingCtorsDtorsPerLibrary; +struct AsyncInfoMng; + struct DeviceTy { int32_t DeviceID; RTLInfoTy *RTL; @@ -384,7 +386,8 @@ /// not. It is the caller's responsibility to skip calling this function if /// the map entry is not expected to exist because \p HstPtrBegin uses shared /// memory. - int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size); + int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size, + AsyncInfoTy &AsyncInfo); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -17,7 +17,8 @@ #include #include #include - +#include +#include #include #define OFFLOAD_SUCCESS (0) @@ -187,10 +188,16 @@ __tgt_async_info AsyncInfo; DeviceTy &Device; + // Enables/Disable synchronization in destructor + bool ShouldSync; public: - AsyncInfoTy(DeviceTy &Device) : Device(Device) {} - ~AsyncInfoTy() { synchronize(); } + AsyncInfoTy(DeviceTy &Device, bool ShouldSync = true) + : Device(Device), ShouldSync(ShouldSync) {} + ~AsyncInfoTy() { + if(ShouldSync) + synchronize(); + } /// Implicit conversion to the __tgt_async_info which is used in the /// plugin interface. @@ -206,6 +213,24 @@ void *&getVoidPtrLocation(); }; +/// This structs allows for automatic asynchronous execution of target regions. +/// It controls whether to synchronize or not based on the value of AsyncFlag. +struct AsyncInfoMng { + static thread_local std::vector> AsyncInfoV; + bool AsyncFlag; + + AsyncInfoMng(); + + // Get async info object + AsyncInfoTy *get(DeviceTy &device); + + // Synchronize asyncinfo + int synchronize(AsyncInfoTy &AsyncInfo, bool ForceSync = false); + + // Free asyncinfo + void free(DeviceTy &device); +}; + /// This struct is a record of non-contiguous information struct __tgt_target_non_contig { uint64_t Offset; Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -49,6 +49,7 @@ return OFFLOAD_SUCCESS; } +// Device DeviceTy::DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(), @@ -438,7 +439,7 @@ } int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, - int64_t Size) { + int64_t Size, AsyncInfoTy &AsyncInfo) { // Check if the pointer is contained in any sub-nodes. if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter)) { @@ -453,7 +454,11 @@ assert(HT.getTotalRefCount() == 0 && HT.getDeleteThreadId() == std::this_thread::get_id() && "Trying to delete entry that is in use or owned by another thread."); - + // Do synchronization + int Ret = AsyncInfo.synchronize(); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + // Delete tgt data DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", DPxPTR(HT.TgtPtrBegin), Size); deleteData((void *)HT.TgtPtrBegin); @@ -466,7 +471,7 @@ HDTTMap->erase(LR.Entry); delete LR.Entry; - int Ret = OFFLOAD_SUCCESS; + Ret = OFFLOAD_SUCCESS; if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) { REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event)); Ret = OFFLOAD_FAIL; Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -21,6 +21,8 @@ #include #include +AsyncInfoMng AIM; + //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t Flags) { @@ -92,11 +94,11 @@ } #endif - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *AIM.get(Device); int Rc = targetDataBegin(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AIM.synchronize(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -141,11 +143,11 @@ } #endif - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *AIM.get(Device); int Rc = targetDataEnd(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AIM.synchronize(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -178,11 +180,11 @@ "Updating OpenMP data"); DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *AIM.get(Device); int Rc = targetDataUpdate(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AIM.synchronize(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -243,13 +245,13 @@ NumTeams = 0; DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *AIM.get(Device); int Rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs, Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames, Args->ArgMappers, NumTeams, ThreadLimit, Args->Tripcount, IsTeams, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = AIM.synchronize(AsyncInfo); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); return OMP_TGT_SUCCESS; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -25,6 +25,7 @@ int AsyncInfoTy::synchronize() { int Result = OFFLOAD_SUCCESS; if (AsyncInfo.Queue) { + DP("Device synchronization\n"); // If we have a queue we need to synchronize it now. Result = Device.synchronize(*this); assert(AsyncInfo.Queue == nullptr && @@ -39,6 +40,52 @@ return BufferLocations.back(); } +// Async info manager +thread_local std::vector> AsyncInfoMng::AsyncInfoV; + +AsyncInfoMng::AsyncInfoMng() { + if (char *EnvStr = getenv("LIBOMPTARGET_ASYNC")) + AsyncFlag = std::stoi(EnvStr) ? true : false; + else + AsyncFlag = false; + DP("Asynchronous execution %s\n", AsyncFlag ? "Enabled" : "Disabled"); +} + +AsyncInfoTy *AsyncInfoMng::get(DeviceTy &device) { + if (!AsyncFlag) + return new AsyncInfoTy(device); + + //Async execution + if (AsyncInfoV.empty()) { + auto num_devices = omp_get_num_devices(); + AsyncInfoV.reserve(num_devices); + for (auto i = 0; i < num_devices; i++) + AsyncInfoV.push_back(nullptr); + } + // Get async info + if (!AsyncInfoV[device.DeviceID]) + AsyncInfoV[device.DeviceID] = std::make_unique(device, false); + return AsyncInfoV[device.DeviceID].get(); +} + +int AsyncInfoMng::synchronize(AsyncInfoTy &AsyncInfo, bool ForceSync) { + int Rc = OFFLOAD_SUCCESS; + if (!AsyncFlag) { + Rc = AsyncInfo.synchronize(); + delete &AsyncInfo; + } else if (ForceSync) { + Rc = AsyncInfo.synchronize(); + } + return Rc; +} + +void AsyncInfoMng::free(DeviceTy &device) { + if (AsyncFlag) { + AsyncInfoV[device.DeviceID].reset(); + AsyncInfoV[device.DeviceID] = nullptr; + } +} + /* All begin addresses for partially mapped structs must be 8-aligned in order * to ensure proper alignment of members. E.g. * @@ -680,7 +727,7 @@ void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { - int Ret; + int Ret = OFFLOAD_SUCCESS; SmallVector PostProcessingPtrs; void *FromMapperBase = nullptr; // process each input. @@ -839,14 +886,6 @@ } } - // TODO: We should not synchronize here but pass the AsyncInfo object to the - // allocate/deallocate device APIs. - // - // We need to synchronize before deallocating data. - Ret = AsyncInfo.synchronize(); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - // Deallocate target pointer for (PostProcessingInfo &Info : PostProcessingPtrs) { // If we marked the entry to be deleted we need to verify no other thread @@ -857,7 +896,6 @@ LookupResult LR; DeviceTy::HDTTMapAccessorTy HDTTMap = Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry); - if (Info.DelEntry) { LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize); if (LR.Entry->getTotalRefCount() != 0 || @@ -895,12 +933,11 @@ }; applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize, Info.TPR); - // If we are deleting the entry the DataMapMtx is locked and we own the // entry. if (Info.DelEntry) { if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin) - Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize); + Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Deallocating data from device failed.\n");