Index: openmp/libomptarget/include/device.h =================================================================== --- openmp/libomptarget/include/device.h +++ openmp/libomptarget/include/device.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "ExclusiveAccess.h" #include "omptarget.h" @@ -306,6 +307,8 @@ typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> PendingCtorsDtorsPerLibrary; +struct AsyncInfoMng; + struct DeviceTy { int32_t DeviceID; RTLInfoTy *RTL; @@ -338,6 +341,9 @@ // moved into the target task in libomp. std::map LoopTripCnt; + // Asyncinfo manager + static AsyncInfoMng AsyncInfoM; + DeviceTy(RTLInfoTy *RTL); // DeviceTy is not copyable DeviceTy(const DeviceTy &D) = delete; @@ -345,6 +351,11 @@ ~DeviceTy(); + // Asyncinfo + AsyncInfoTy *getAsyncInfo(); + int syncAsyncInfo(AsyncInfoTy &AsyncInfo, bool ForceSync = false); + void freeAsyncInfo(); + // Return true if data can be copied to DstDevice directly bool isDataExchangable(const DeviceTy &DstDevice); @@ -388,7 +399,8 @@ /// not. It is the caller's responsibility to skip calling this function if /// the map entry is not expected to exist because \p HstPtrBegin uses shared /// memory. - int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size); + int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size, + AsyncInfoTy &AsyncInfo); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); @@ -469,6 +481,22 @@ void deinit(); }; +struct AsyncInfoMng { + static thread_local std::vector> AsyncInfoV; + bool AsyncFlag; + + AsyncInfoMng(); + + // Get async info object + AsyncInfoTy *get(DeviceTy &device); + + // Synchronize asyncinfo + int synchronize(AsyncInfoTy &AsyncInfo, bool ForceSync = false); + + // Free asyncinfo + void free(DeviceTy &device); +}; + extern bool deviceIsReady(int DeviceNum); /// Struct for the data required to handle plugins Index: openmp/libomptarget/include/omptarget.h =================================================================== --- openmp/libomptarget/include/omptarget.h +++ openmp/libomptarget/include/omptarget.h @@ -190,7 +190,7 @@ public: AsyncInfoTy(DeviceTy &Device) : Device(Device) {} - ~AsyncInfoTy() { synchronize(); } + ~AsyncInfoTy() {} /// Implicit conversion to the __tgt_async_info which is used in the /// plugin interface. Index: openmp/libomptarget/src/device.cpp =================================================================== --- openmp/libomptarget/src/device.cpp +++ openmp/libomptarget/src/device.cpp @@ -49,6 +49,61 @@ return OFFLOAD_SUCCESS; } +// Async info manager +thread_local std::vector> AsyncInfoMng::AsyncInfoV; + +AsyncInfoMng::AsyncInfoMng() { + if (char *EnvStr = getenv("LIBOMPTARGET_ASYNC")) + AsyncFlag = std::stoi(EnvStr) ? true : false; + else + AsyncFlag = false; + + if (AsyncFlag) { + DP("Asynchronous execution enabled\n"); + } else { + DP("Asynchronous execution disabled\n"); + } +} + +AsyncInfoTy *AsyncInfoMng::get(DeviceTy &device) { + if (AsyncFlag) { + // Set AsyncInfoV + if (AsyncInfoV.empty()) { + auto num_devices = omp_get_num_devices(); + AsyncInfoV.reserve(num_devices); + for (auto i = 0; i < num_devices; i++) + AsyncInfoV.push_back(nullptr); + } + // Get async info + if (!AsyncInfoV[device.DeviceID]) + AsyncInfoV[device.DeviceID] = std::make_unique(device); + return AsyncInfoV[device.DeviceID].get(); + } else { + return new AsyncInfoTy(device); + } +} + +int AsyncInfoMng::synchronize(AsyncInfoTy &AsyncInfo, bool ForceSync) { + int Rc = OFFLOAD_SUCCESS; + if (!AsyncFlag) { + Rc = AsyncInfo.synchronize(); + delete &AsyncInfo; + } else if (ForceSync) { + Rc = AsyncInfo.synchronize(); + } + return Rc; +} + +void AsyncInfoMng::free(DeviceTy &device) { + if (AsyncFlag) { + AsyncInfoV[device.DeviceID].reset(); + AsyncInfoV[device.DeviceID] = nullptr; + } +} + +// Device +AsyncInfoMng DeviceTy::AsyncInfoM; + DeviceTy::DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(), @@ -62,6 +117,14 @@ dumpTargetPointerMappings(&Loc, *this); } +AsyncInfoTy *DeviceTy::getAsyncInfo() { return AsyncInfoM.get(*this); }; + +int DeviceTy::syncAsyncInfo(AsyncInfoTy &AsyncInfo, bool ForceSync) { + return AsyncInfoM.synchronize(AsyncInfo, ForceSync); +}; + +void DeviceTy::freeAsyncInfo() { AsyncInfoM.free(*this); }; + int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor(); @@ -438,7 +501,7 @@ } int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, - int64_t Size) { + int64_t Size, AsyncInfoTy &AsyncInfo) { // Check if the pointer is contained in any sub-nodes. if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter)) { @@ -453,7 +516,11 @@ assert(HT.getTotalRefCount() == 0 && HT.getDeleteThreadId() == std::this_thread::get_id() && "Trying to delete entry that is in use or owned by another thread."); - + // Do synchronization + int Ret = AsyncInfo.synchronize(); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + // Delete tgt data DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n", DPxPTR(HT.TgtPtrBegin), Size); deleteData((void *)HT.TgtPtrBegin); @@ -466,7 +533,7 @@ HDTTMap->erase(LR.Entry); delete LR.Entry; - int Ret = OFFLOAD_SUCCESS; + Ret = OFFLOAD_SUCCESS; if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) { REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event)); Ret = OFFLOAD_FAIL; Index: openmp/libomptarget/src/interface.cpp =================================================================== --- openmp/libomptarget/src/interface.cpp +++ openmp/libomptarget/src/interface.cpp @@ -92,11 +92,11 @@ } #endif - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *Device.getAsyncInfo(); int Rc = targetDataBegin(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = Device.syncAsyncInfo(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -141,11 +141,11 @@ } #endif - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *Device.getAsyncInfo(); int Rc = targetDataEnd(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = Device.syncAsyncInfo(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -178,11 +178,11 @@ "Updating OpenMP data"); DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *Device.getAsyncInfo(); int Rc = targetDataUpdate(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = Device.syncAsyncInfo(AsyncInfo, true); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } @@ -243,13 +243,13 @@ NumTeams = 0; DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); + AsyncInfoTy &AsyncInfo = *Device.getAsyncInfo(); int Rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs, Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames, Args->ArgMappers, NumTeams, ThreadLimit, Args->Tripcount, IsTeams, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + Rc = Device.syncAsyncInfo(AsyncInfo); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); return OMP_TGT_SUCCESS; Index: openmp/libomptarget/src/omptarget.cpp =================================================================== --- openmp/libomptarget/src/omptarget.cpp +++ openmp/libomptarget/src/omptarget.cpp @@ -25,6 +25,7 @@ int AsyncInfoTy::synchronize() { int Result = OFFLOAD_SUCCESS; if (AsyncInfo.Queue) { + DP("Device synchronization\n"); // If we have a queue we need to synchronize it now. Result = Device.synchronize(*this); assert(AsyncInfo.Queue == nullptr && @@ -680,7 +681,7 @@ void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { - int Ret; + int Ret = OFFLOAD_SUCCESS; SmallVector PostProcessingPtrs; void *FromMapperBase = nullptr; // process each input. @@ -839,14 +840,6 @@ } } - // TODO: We should not synchronize here but pass the AsyncInfo object to the - // allocate/deallocate device APIs. - // - // We need to synchronize before deallocating data. - Ret = AsyncInfo.synchronize(); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - // Deallocate target pointer for (PostProcessingInfo &Info : PostProcessingPtrs) { // If we marked the entry to be deleted we need to verify no other thread @@ -857,7 +850,6 @@ LookupResult LR; DeviceTy::HDTTMapAccessorTy HDTTMap = Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry); - if (Info.DelEntry) { LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize); if (LR.Entry->getTotalRefCount() != 0 || @@ -895,12 +887,11 @@ }; applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize, Info.TPR); - // If we are deleting the entry the DataMapMtx is locked and we own the // entry. if (Info.DelEntry) { if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin) - Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize); + Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Deallocating data from device failed.\n");