diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "omptarget.h" #include "private.h" #include "rtl.h" @@ -179,8 +180,11 @@ } #endif + __tgt_async_info AsyncInfo; int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, nullptr); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = synchronizeAsyncInfo(Device, AsyncInfo); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -262,8 +266,11 @@ } #endif + __tgt_async_info AsyncInfo; int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, nullptr); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = synchronizeAsyncInfo(Device, AsyncInfo); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -324,9 +331,12 @@ printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types, arg_names, "Updating OpenMP data"); + __tgt_async_info AsyncInfo; DeviceTy &Device = PM->Devices[device_id]; int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = synchronizeAsyncInfo(Device, AsyncInfo); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -393,8 +403,13 @@ } #endif - int rc = target(loc, device_id, host_ptr, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, 0, 0, false /*team*/); + DeviceTy &Device = PM->Devices[device_id]; + __tgt_async_info AsyncInfo; + int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, + AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = 
synchronizeAsyncInfo(Device, AsyncInfo); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); return rc; } @@ -467,9 +482,13 @@ } #endif - int rc = target(loc, device_id, host_ptr, arg_num, args_base, args, arg_sizes, + DeviceTy &Device = PM->Devices[device_id]; + __tgt_async_info AsyncInfo; + int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, arg_names, arg_mappers, team_num, thread_limit, - true /*team*/); + true /*team*/, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = synchronizeAsyncInfo(Device, AsyncInfo); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); return rc; } diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "omptarget.h" #include "device.h" #include "private.h" #include "rtl.h" @@ -152,6 +153,7 @@ /* * Run ctors for static objects */ + __tgt_async_info AsyncInfo; if (!Device.PendingCtorsDtors.empty()) { // Call all ctors for all libraries registered so far for (auto &lib : Device.PendingCtorsDtors) { @@ -160,8 +162,8 @@ for (auto &entry : lib.second.PendingCtors) { void *ctor = entry; int rc = - target(nullptr, device_id, ctor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/); + target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo); if (rc != OFFLOAD_SUCCESS) { REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); Device.PendingGlobalsMtx.unlock(); @@ -174,10 +176,14 @@ } } } + + // All constructors have been issued, wait for them now. 
+ rc = synchronizeAsyncInfo(Device, AsyncInfo); + Device.HasPendingGlobals = false; Device.PendingGlobalsMtx.unlock(); - return OFFLOAD_SUCCESS; + return rc; } // Check whether a device has been initialized, global ctors have been @@ -213,6 +219,7 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg, int64_t arg_size, int64_t arg_type, map_var_info_t arg_names, void *arg_mapper, + __tgt_async_info &AsyncInfo, TargetDataFuncPtrTy target_data_function) { TIMESCOPE_WITH_IDENT(loc); DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper)); @@ -247,7 +254,7 @@ MapperArgsBase.data(), MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), MapperArgNames.data(), /*arg_mappers*/ nullptr, - /*__tgt_async_info*/ nullptr); + /*__tgt_async_info*/ AsyncInfo); return rc; } @@ -256,7 +263,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, __tgt_async_info *async_info_ptr) { + void **arg_mappers, __tgt_async_info &AsyncInfo) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -271,9 +278,9 @@ DP("Calling targetDataMapper for the %dth argument\n", i); map_var_info_t arg_name = (!arg_names) ? 
nullptr : arg_names[i]; - int rc = targetDataMapper(loc, Device, args_base[i], args[i], arg_sizes[i], - arg_types[i], arg_name, arg_mappers[i], - targetDataBegin); + int rc = targetDataMapper(loc, Device, args_base[i], args[i], + arg_sizes[i], arg_types[i], arg_name, + arg_mappers[i], AsyncInfo, targetDataBegin); if (rc != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" @@ -402,8 +409,8 @@ if (copy && !IsHostPtr) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, - async_info_ptr); + int rt = + Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, &AsyncInfo); if (rt != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -417,7 +424,7 @@ uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, - sizeof(void *), async_info_ptr); + sizeof(void *), &AsyncInfo); if (rt != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -451,24 +458,28 @@ : HstPtrBegin(HstPtr), DataSize(Size), ForceDelete(ForceDelete), HasCloseModifier(HasCloseModifier) {} }; +} // namespace + +/// Synchronize \p AsyncInfo for \p Device, if synchronization is possible. +int synchronizeAsyncInfo(DeviceTy &Device, __tgt_async_info &AsyncInfo) { + // If AsyncInfo.Queue is nullptr, we have no pending asynchronous operation so + // we don't need to synchronize. 
+ if (!AsyncInfo.Queue) + return OFFLOAD_SUCCESS; -/// Synchronize device -static int syncDevice(DeviceTy &Device, __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && AsyncInfo->Queue && "Invalid AsyncInfo"); - if (Device.synchronize(AsyncInfo) != OFFLOAD_SUCCESS) { + if (Device.synchronize(&AsyncInfo) != OFFLOAD_SUCCESS) { REPORT("Failed to synchronize device.\n"); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } -} // namespace /// Internal function to undo the mapping and retrieve the data from the device. int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, __tgt_async_info *AsyncInfo) { + void **ArgMappers, __tgt_async_info &AsyncInfo) { int Ret; std::vector DeallocTgtPtrs; // process each input. @@ -486,9 +497,9 @@ DP("Calling targetDataMapper for the %dth argument\n", I); map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; - Ret = - targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], - ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd); + Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], + ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, + targetDataEnd); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataEnd via targetDataMapper for custom mapper" @@ -583,7 +594,7 @@ DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, - AsyncInfo); + &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -636,16 +647,13 @@ } // We need to synchronize before deallocating data. - // If AsyncInfo is nullptr, the previous data transfer (if has) will be - // synchronous, so we don't need to synchronize again. 
If AsyncInfo->Queue is - // nullptr, there is no data transfer happened because once there is, - // AsyncInfo->Queue will not be nullptr, so again, we don't need to - // synchronize. - if (AsyncInfo && AsyncInfo->Queue) { - Ret = syncDevice(Device, AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - } + Ret = synchronizeAsyncInfo(Device, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + // TODO: We should not synchronize here, but since we did, we don't need + // to do it later. + AsyncInfo.Queue = nullptr; // Deallocate target pointer for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) { @@ -662,7 +670,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase, void *HstPtrBegin, int64_t ArgSize, - int64_t ArgType) { + int64_t ArgType, __tgt_async_info &AsyncInfo) { TIMESCOPE_WITH_IDENT(loc); bool IsLast, IsHostPtr; void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false, @@ -688,7 +696,8 @@ if (ArgType & OMP_TGT_MAPTYPE_FROM) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr); + int Ret = + Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -715,7 +724,7 @@ if (ArgType & OMP_TGT_MAPTYPE_TO) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr); + int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -735,7 +744,7 @@ "pointer " DPxMOD "\n", DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr)); Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal, 
- sizeof(void *), nullptr); + sizeof(void *), &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); Device.ShadowMtx.unlock(); @@ -751,8 +760,8 @@ void *ArgsBase, __tgt_target_non_contig *NonContig, uint64_t Size, int64_t ArgType, - int CurrentDim, int DimSize, - uint64_t Offset) { + int CurrentDim, int DimSize, uint64_t Offset, + __tgt_async_info &AsyncInfo) { TIMESCOPE_WITH_IDENT(loc); int Ret = OFFLOAD_SUCCESS; if (CurrentDim < DimSize) { @@ -764,7 +773,7 @@ if (CurrentDim != DimSize - 1 || I == 0) { Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size, ArgType, CurrentDim + 1, DimSize, - Offset + CurOffset); + Offset + CurOffset, AsyncInfo); // Stop the whole process if any contiguous piece returns anything // other than OFFLOAD_SUCCESS. if (Ret != OFFLOAD_SUCCESS) @@ -776,7 +785,8 @@ DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64 " len %" PRIu64 "\n", DPxPTR(Ptr), Offset, Size); - Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType); + Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType, + AsyncInfo); } return Ret; } @@ -792,12 +802,10 @@ } /// Internal function to pass data to/from the target. -// async_info_ptr is currently unused, added here so targetDataUpdate has the -// same signature as targetDataBegin and targetDataEnd. int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, __tgt_async_info *AsyncInfoPtr) { + void **ArgMappers, __tgt_async_info &AsyncInfo) { // process each input. for (int32_t I = 0; I < ArgNum; ++I) { if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || @@ -812,7 +820,7 @@ map_var_info_t ArgName = (!ArgNames) ? 
nullptr : ArgNames[I]; int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], - ArgTypes[I], ArgName, ArgMappers[I], + ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, targetDataUpdate); if (Ret != OFFLOAD_SUCCESS) { @@ -835,10 +843,10 @@ int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); Ret = targetDataNonContiguous( loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I], - /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0); + /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo); } else { Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], - ArgTypes[I]); + ArgTypes[I], AsyncInfo); } if (Ret == OFFLOAD_FAIL) return OFFLOAD_FAIL; @@ -1088,7 +1096,7 @@ std::vector &TgtArgs, std::vector &TgtOffsets, PrivateArgumentManagerTy &PrivateArgumentManager, - __tgt_async_info *AsyncInfo) { + __tgt_async_info &AsyncInfo) { TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc); DeviceTy &Device = PM->Devices[DeviceId]; int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes, @@ -1139,7 +1147,7 @@ DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, - sizeof(void *), AsyncInfo); + sizeof(void *), &AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -1209,7 +1217,7 @@ int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, PrivateArgumentManagerTy &PrivateArgumentManager, - __tgt_async_info *AsyncInfo) { + __tgt_async_info &AsyncInfo) { TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc); DeviceTy &Device = PM->Devices[DeviceId]; @@ -1238,11 +1246,11 @@ /// performs the same action as data_update and data_end above. This function /// returns 0 if it was able to transfer the execution to a target and an /// integer different from zero otherwise. 
-int target(ident_t *loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum, +int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, - int32_t ThreadLimit, int IsTeamConstruct) { - DeviceTy &Device = PM->Devices[DeviceId]; + int32_t ThreadLimit, int IsTeamConstruct, + __tgt_async_info &AsyncInfo) { TableMap *TM = getTableMap(HostPtr); // No map for this host pointer found! @@ -1256,14 +1264,12 @@ __tgt_target_table *TargetTable = nullptr; { std::lock_guard TrlTblLock(PM->TrlTblMtx); - assert(TM->Table->TargetsTable.size() > (size_t)DeviceId && + assert(TM->Table->TargetsTable.size() > (size_t)Device.DeviceID && "Not expecting a device ID outside the table's bounds!"); - TargetTable = TM->Table->TargetsTable[DeviceId]; + TargetTable = TM->Table->TargetsTable[Device.DeviceID]; } assert(TargetTable && "Global data has not been mapped\n"); - __tgt_async_info AsyncInfo; - std::vector TgtArgs; std::vector TgtOffsets; @@ -1272,9 +1278,10 @@ int Ret; if (ArgNum) { // Process data, such as data mapping, before launching the kernel - Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args, - ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs, - TgtOffsets, PrivateArgumentManager, &AsyncInfo); + Ret = + processDataBefore(loc, Device.DeviceID, HostPtr, ArgNum, ArgBases, Args, + ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs, + TgtOffsets, PrivateArgumentManager, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Failed to process data before launching the kernel.\n"); return OFFLOAD_FAIL; @@ -1282,7 +1289,7 @@ } // Get loop trip count - uint64_t LoopTripCount = getLoopTripCount(DeviceId); + uint64_t LoopTripCount = getLoopTripCount(Device.DeviceID); // Launch device execution. 
void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; @@ -1309,18 +1316,13 @@ if (ArgNum) { // Transfer data back and deallocate target memory for (first-)private // variables - Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args, - ArgSizes, ArgTypes, ArgNames, ArgMappers, - PrivateArgumentManager, &AsyncInfo); + Ret = processDataAfter(loc, Device.DeviceID, HostPtr, ArgNum, ArgBases, + Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, + PrivateArgumentManager, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Failed to process data after launching the kernel.\n"); return OFFLOAD_FAIL; } - } else if (AsyncInfo.Queue) { - // If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't - // hava any argument, and the device supports async operations, so we need a - // sync at this point. - return syncDevice(Device, &AsyncInfo); } return OFFLOAD_SUCCESS; diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -13,6 +13,7 @@ #ifndef _OMPTARGET_PRIVATE_H #define _OMPTARGET_PRIVATE_H +#include "rtl.h" #include #include #include @@ -22,25 +23,25 @@ extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, - __tgt_async_info *async_info_ptr); + void **arg_mappers, __tgt_async_info &AsyncInfo); extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *arg_names, - void **ArgMappers, __tgt_async_info *AsyncInfo); + void **ArgMappers, __tgt_async_info &AsyncInfo); extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, - __tgt_async_info *async_info_ptr = 
nullptr); + void **arg_mappers, __tgt_async_info &AsyncInfo); -extern int target(ident_t *loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum, +extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *arg_names, void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit, - int IsTeamConstruct); + int IsTeamConstruct, __tgt_async_info &AsyncInfo); + +extern int synchronizeAsyncInfo(DeviceTy &Device, __tgt_async_info &AsyncInfo); extern int CheckDeviceAndCtors(int64_t device_id); @@ -78,7 +79,7 @@ typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, map_var_info_t *, void **, - __tgt_async_info *); + __tgt_async_info &); // Implemented in libomp, they are called from within __tgt_* functions. #ifdef __cplusplus diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -396,11 +396,12 @@ for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { DeviceTy &Device = PM->Devices[FoundRTL->Idx + i]; Device.PendingGlobalsMtx.lock(); + __tgt_async_info AsyncInfo; if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { - int rc = - target(nullptr, Device.DeviceID, dtor, 0, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, 1, 1, true /*team*/); + int rc = target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, 1, 1, true /*team*/, + AsyncInfo); if (rc != OFFLOAD_SUCCESS) { DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); } @@ -408,6 +409,7 @@ // Remove this library's entry from PendingCtorsDtors Device.PendingCtorsDtors.erase(desc); } + synchronizeAsyncInfo(Device, AsyncInfo); Device.PendingGlobalsMtx.unlock(); }