diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "omptarget.h" #include "private.h" #include "rtl.h" @@ -171,11 +172,13 @@ } else if (src_device == omp_get_initial_device()) { DP("copy from host to device\n"); DeviceTy &DstDev = PM->Devices[dst_device]; - rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr); + AsyncInfoTy AsyncInfo(DstDev); + rc = DstDev.submitData(dstAddr, srcAddr, length, AsyncInfo); } else if (dst_device == omp_get_initial_device()) { DP("copy from device to host\n"); DeviceTy &SrcDev = PM->Devices[src_device]; - rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr); + AsyncInfoTy AsyncInfo(SrcDev); + rc = SrcDev.retrieveData(dstAddr, srcAddr, length, AsyncInfo); } else { DP("copy from device to device\n"); DeviceTy &SrcDev = PM->Devices[src_device]; @@ -183,15 +186,21 @@ // First try to use D2D memcpy which is more efficient. If fails, fall back // to unefficient way. 
if (SrcDev.isDataExchangable(DstDev)) { - rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr); + AsyncInfoTy AsyncInfo(SrcDev); + rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, AsyncInfo); if (rc == OFFLOAD_SUCCESS) return OFFLOAD_SUCCESS; } void *buffer = malloc(length); - rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr); - if (rc == OFFLOAD_SUCCESS) - rc = DstDev.submitData(dstAddr, buffer, length, nullptr); + { + AsyncInfoTy AsyncInfo(SrcDev); + rc = SrcDev.retrieveData(buffer, srcAddr, length, AsyncInfo); + } + if (rc == OFFLOAD_SUCCESS) { + AsyncInfoTy AsyncInfo(DstDev); + rc = DstDev.submitData(dstAddr, buffer, length, AsyncInfo); + } free(buffer); } diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -22,13 +22,13 @@ #include #include +#include "omptarget.h" #include "rtl.h" // Forward declarations. struct RTLInfoTy; struct __tgt_bin_desc; struct __tgt_target_table; -struct __tgt_async_info; using map_var_info_t = void *; @@ -200,24 +200,24 @@ // synchronous. 
// Copy data from host to device int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, - __tgt_async_info *AsyncInfoPtr); + AsyncInfoTy &AsyncInfo); // Copy data from device back to host int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, - __tgt_async_info *AsyncInfoPtr); + AsyncInfoTy &AsyncInfo); // Copy data from current device to destination device directly int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, - int64_t Size, __tgt_async_info *AsyncInfo); + int64_t Size, AsyncInfoTy &AsyncInfo); int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, - int32_t TgtVarsSize, __tgt_async_info *AsyncInfoPtr); + int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo); int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr); + uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo); /// Synchronize device/queue/event based on \p AsyncInfoPtr and return /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. 
- int32_t synchronize(__tgt_async_info *AsyncInfoPtr); + int32_t synchronize(AsyncInfoTy &AsyncInfo); private: // Call to RTL diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -415,27 +415,27 @@ // Submit data to device int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, - __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize) + AsyncInfoTy &AsyncInfo) { + if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize) return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); else return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, - AsyncInfoPtr); + AsyncInfo); } // Retrieve data from device int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin, - int64_t Size, __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) + int64_t Size, AsyncInfoTy &AsyncInfo) { + if (!RTL->data_retrieve_async || !RTL->synchronize) return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); else return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, - AsyncInfoPtr); + AsyncInfo); } // Copy data from current device to destination device directly int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, - int64_t Size, __tgt_async_info *AsyncInfo) { + int64_t Size, AsyncInfoTy &AsyncInfo) { if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) { assert(RTL->data_exchange && "RTL->data_exchange is nullptr"); return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, @@ -448,13 +448,13 @@ // Run region on device int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize) + 
AsyncInfoTy &AsyncInfo) { + if (!RTL->run_region_async || !RTL->synchronize) return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, TgtVarsSize); else return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, AsyncInfoPtr); + TgtOffsets, TgtVarsSize, AsyncInfo); } // Run team region on device. @@ -462,15 +462,15 @@ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripCount, - __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize) + AsyncInfoTy &AsyncInfo) { + if (!RTL->run_team_region_async || !RTL->synchronize) return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount); else return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, TgtVarsSize, NumTeams, - ThreadLimit, LoopTripCount, AsyncInfoPtr); + ThreadLimit, LoopTripCount, AsyncInfo); } // Whether data can be copied to DstDevice directly @@ -485,9 +485,9 @@ return false; } -int32_t DeviceTy::synchronize(__tgt_async_info *AsyncInfoPtr) { +int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) { if (RTL->synchronize) - return RTL->synchronize(RTLDeviceID, AsyncInfoPtr); + return RTL->synchronize(RTLDeviceID, AsyncInfo); return OFFLOAD_SUCCESS; } diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "omptarget.h" #include "private.h" #include "rtl.h" @@ -183,8 +184,11 @@ } #endif + AsyncInfoTy AsyncInfo(Device); int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, nullptr); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = 
AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -270,8 +274,11 @@ } #endif + AsyncInfoTy AsyncInfo(Device); int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, nullptr); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -335,8 +342,11 @@ arg_names, "Updating OpenMP data"); DeviceTy &Device = PM->Devices[device_id]; + AsyncInfoTy AsyncInfo(Device); int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, nullptr); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -408,9 +418,12 @@ #endif DeviceTy &Device = PM->Devices[device_id]; - int rc = - target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, nullptr); + AsyncInfoTy AsyncInfo(Device); + int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, + AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); return rc; } @@ -490,9 +503,12 @@ #endif DeviceTy &Device = PM->Devices[device_id]; + AsyncInfoTy AsyncInfo(Device); int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, arg_names, arg_mappers, team_num, thread_limit, - true /*team*/, nullptr); + true /*team*/, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); return rc; } diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -23,7 +23,7 @@ int Result = 
OFFLOAD_SUCCESS; if (AsyncInfo.Queue) { // If we have a queue we need to synchronize it now. - Result = Device.synchronize(&AsyncInfo); + Result = Device.synchronize(*this); assert(AsyncInfo.Queue == nullptr && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!"); @@ -166,6 +166,7 @@ * Run ctors for static objects */ if (!Device.PendingCtorsDtors.empty()) { + AsyncInfoTy AsyncInfo(Device); // Call all ctors for all libraries registered so far for (auto &lib : Device.PendingCtorsDtors) { if (!lib.second.PendingCtors.empty()) { @@ -174,7 +175,7 @@ void *ctor = entry; int rc = target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr); + nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo); if (rc != OFFLOAD_SUCCESS) { REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); Device.PendingGlobalsMtx.unlock(); @@ -186,6 +187,12 @@ DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); } } + // All constructors have been issued, wait for them now. 
+ if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) { + // Do not leave PendingGlobalsMtx locked on the failure path. + Device.PendingGlobalsMtx.unlock(); + return OFFLOAD_FAIL; + } } Device.HasPendingGlobals = false; Device.PendingGlobalsMtx.unlock(); @@ -226,6 +230,7 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg, int64_t arg_size, int64_t arg_type, map_var_info_t arg_names, void *arg_mapper, + AsyncInfoTy &AsyncInfo, TargetDataFuncPtrTy target_data_function) { TIMESCOPE_WITH_IDENT(loc); DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper)); @@ -256,11 +261,10 @@ MapperArgNames[I] = C.Name; } - int rc = target_data_function(loc, Device, MapperComponents.Components.size(), - MapperArgsBase.data(), MapperArgs.data(), - MapperArgSizes.data(), MapperArgTypes.data(), - MapperArgNames.data(), /*arg_mappers*/ nullptr, - /* AsyncInfoTy */ nullptr); + int rc = target_data_function( + loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(), + MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), + MapperArgNames.data(), /*arg_mappers*/ nullptr, AsyncInfo); return rc; } @@ -269,7 +273,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, AsyncInfoTy *AsyncInfo) { + void **arg_mappers, AsyncInfoTy &AsyncInfo) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -286,7 +290,7 @@ map_var_info_t arg_name = (!arg_names) ? 
nullptr : arg_names[i]; int rc = targetDataMapper(loc, Device, args_base[i], args[i], arg_sizes[i], arg_types[i], arg_name, - arg_mappers[i], targetDataBegin); + arg_mappers[i], AsyncInfo, targetDataBegin); if (rc != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" @@ -416,7 +420,7 @@ DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); int rt = - Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, *AsyncInfo); + Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo); if (rt != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -430,7 +434,7 @@ uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, - sizeof(void *), *AsyncInfo); + sizeof(void *), AsyncInfo); if (rt != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -470,7 +474,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy *AsyncInfo) { + void **ArgMappers, AsyncInfoTy &AsyncInfo) { int Ret; std::vector DeallocTgtPtrs; // process each input. @@ -488,9 +492,9 @@ DP("Calling targetDataMapper for the %dth argument\n", I); map_var_info_t ArgName = (!ArgNames) ? 
nullptr : ArgNames[I]; - Ret = - targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], - ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd); + Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], + ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, + targetDataEnd); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataEnd via targetDataMapper for custom mapper" @@ -585,7 +589,7 @@ DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, - *AsyncInfo); + AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -637,17 +641,13 @@ } } + // TODO: We should not synchronize here but pass the AsyncInfo object to the + // allocate/deallocate device APIs. + // // We need to synchronize before deallocating data. - // If AsyncInfo is nullptr, the previous data transfer (if has) will be - // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is - // nullptr, there is no data transfer happened because once there is, - // AsyncInfo->Queue will not be nullptr, so again, we don't need to - // synchronize. 
- if (AsyncInfo) { - Ret = AsyncInfo->synchronize(); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - } + Ret = AsyncInfo.synchronize(); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; // Deallocate target pointer for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) { @@ -664,7 +664,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase, void *HstPtrBegin, int64_t ArgSize, - int64_t ArgType) { + int64_t ArgType, AsyncInfoTy &AsyncInfo) { TIMESCOPE_WITH_IDENT(loc); bool IsLast, IsHostPtr; void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false, @@ -690,7 +690,7 @@ if (ArgType & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr); + int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -717,7 +717,7 @@ if (ArgType & OMP_TGT_MAPTYPE_TO) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr); + int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -737,7 +737,7 @@ "pointer " DPxMOD "\n", DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr)); Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal, - sizeof(void *), nullptr); + sizeof(void *), AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); Device.ShadowMtx.unlock(); @@ -753,8 +753,8 @@ void *ArgsBase, __tgt_target_non_contig *NonContig, uint64_t Size, int64_t ArgType, - int CurrentDim, int DimSize, - uint64_t Offset) { + int CurrentDim, int DimSize, uint64_t 
Offset, + AsyncInfoTy &AsyncInfo) { TIMESCOPE_WITH_IDENT(loc); int Ret = OFFLOAD_SUCCESS; if (CurrentDim < DimSize) { @@ -766,7 +766,7 @@ if (CurrentDim != DimSize - 1 || I == 0) { Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size, ArgType, CurrentDim + 1, DimSize, - Offset + CurOffset); + Offset + CurOffset, AsyncInfo); // Stop the whole process if any contiguous piece returns anything // other than OFFLOAD_SUCCESS. if (Ret != OFFLOAD_SUCCESS) @@ -778,7 +778,8 @@ DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64 " len %" PRIu64 "\n", DPxPTR(Ptr), Offset, Size); - Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType); + Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType, + AsyncInfo); } return Ret; } @@ -794,12 +795,10 @@ } /// Internal function to pass data to/from the target. -// AsyncInfo is currently unused, added here so targetDataUpdate has the -// same signature as targetDataBegin and targetDataEnd. int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy *AsyncInfo) { + void **ArgMappers, AsyncInfoTy &AsyncInfo) { // process each input. for (int32_t I = 0; I < ArgNum; ++I) { if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || @@ -814,7 +813,7 @@ map_var_info_t ArgName = (!ArgNames) ? 
nullptr : ArgNames[I]; int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], - ArgTypes[I], ArgName, ArgMappers[I], + ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, targetDataUpdate); if (Ret != OFFLOAD_SUCCESS) { @@ -837,10 +836,10 @@ int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); Ret = targetDataNonContiguous( loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I], - /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0); + /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo); } else { Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], - ArgTypes[I]); + ArgTypes[I], AsyncInfo); } if (Ret == OFFLOAD_FAIL) return OFFLOAD_FAIL; @@ -950,7 +949,7 @@ /// A reference to the \p DeviceTy object DeviceTy &Device; /// A pointer to a \p AsyncInfoTy object - AsyncInfoTy *AsyncInfo; + AsyncInfoTy &AsyncInfo; // TODO: What would be the best value here? Should we make it configurable? // If the size is larger than this threshold, we will allocate and transfer it @@ -959,7 +958,7 @@ public: /// Constructor - PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy *AsyncInfo) + PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo) : Device(Dev), AsyncInfo(AsyncInfo) {} /// Add a private argument @@ -986,7 +985,7 @@ #endif // If first-private, copy data from host if (IsFirstPrivate) { - int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, *AsyncInfo); + int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Copying data to device failed, failed.\n"); return OFFLOAD_FAIL; @@ -1042,7 +1041,7 @@ FirstPrivateArgSize, DPxPTR(TgtPtr)); // Transfer data to target device int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), - FirstPrivateArgSize, *AsyncInfo); + FirstPrivateArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Failed to submit data of private arguments.\n"); return OFFLOAD_FAIL; @@ -1090,7 +1089,7 @@ std::vector &TgtArgs, std::vector 
&TgtOffsets, PrivateArgumentManagerTy &PrivateArgumentManager, - AsyncInfoTy *AsyncInfo) { + AsyncInfoTy &AsyncInfo) { TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc); DeviceTy &Device = PM->Devices[DeviceId]; int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes, @@ -1141,7 +1140,7 @@ DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, - sizeof(void *), *AsyncInfo); + sizeof(void *), AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -1211,7 +1210,7 @@ int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, PrivateArgumentManagerTy &PrivateArgumentManager, - AsyncInfoTy *AsyncInfo) { + AsyncInfoTy &AsyncInfo) { TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc); DeviceTy &Device = PM->Devices[DeviceId]; @@ -1243,7 +1242,7 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, - int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy *AsyncInfo) { + int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) { int32_t DeviceId = Device.DeviceID; TableMap *TM = getTableMap(HostPtr); @@ -1264,12 +1263,6 @@ } assert(TargetTable && "Global data has not been mapped\n"); - // TODO: This will go away as soon as we consequently pass in async info - // objects (as references). 
- AsyncInfoTy InternalAsyncInfo(Device); - if (!AsyncInfo) - AsyncInfo = &InternalAsyncInfo; - std::vector TgtArgs; std::vector TgtOffsets; @@ -1301,10 +1294,10 @@ if (IsTeamConstruct) Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], TgtArgs.size(), TeamNum, ThreadLimit, - LoopTripCount, *AsyncInfo); + LoopTripCount, AsyncInfo); else Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], - TgtArgs.size(), *AsyncInfo); + TgtArgs.size(), AsyncInfo); } if (Ret != OFFLOAD_SUCCESS) { @@ -1322,13 +1315,6 @@ REPORT("Failed to process data after launching the kernel.\n"); return OFFLOAD_FAIL; } - } else { - // TODO: We should not synchronize here but on the outer level once we pass - // in a reference AsyncInfo object. - // If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't - // hava any argument, and the device supports async operations, so we need a - // sync at this point. - return AsyncInfo->synchronize(); } return OFFLOAD_SUCCESS; diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -23,23 +23,23 @@ extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, AsyncInfoTy *AsyncInfo); + void **arg_mappers, AsyncInfoTy &AsyncInfo); extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *arg_names, - void **ArgMappers, AsyncInfoTy *AsyncInfo); + void **ArgMappers, AsyncInfoTy &AsyncInfo); extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, AsyncInfoTy *AsyncInfo); + void **arg_mappers, AsyncInfoTy &AsyncInfo); extern int target(ident_t 
*loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *arg_names, void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit, - int IsTeamConstruct, AsyncInfoTy *AsyncInfo); + int IsTeamConstruct, AsyncInfoTy &AsyncInfo); extern int CheckDeviceAndCtors(int64_t device_id); @@ -76,7 +76,7 @@ // targetDataEnd and targetDataUpdate). typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, - map_var_info_t *, void **, AsyncInfoTy *); + map_var_info_t *, void **, AsyncInfoTy &); // Implemented in libomp, they are called from within __tgt_* functions. #ifdef __cplusplus diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -400,16 +400,20 @@ DeviceTy &Device = PM->Devices[FoundRTL->Idx + i]; Device.PendingGlobalsMtx.lock(); if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { + AsyncInfoTy AsyncInfo(Device); for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { - int rc = - target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr); + int rc = target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, 1, 1, true /*team*/, + AsyncInfo); if (rc != OFFLOAD_SUCCESS) { DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); } } // Remove this library's entry from PendingCtorsDtors Device.PendingCtorsDtors.erase(desc); + // All destructors have been issued, wait for them now. + if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + DP("Failed synchronizing destructors kernels.\n"); } Device.PendingGlobalsMtx.unlock(); }