diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "device.h" +#include "omptarget.h" #include "private.h" #include "rtl.h" @@ -183,8 +184,11 @@ } #endif + AsyncInfoTy AsyncInfo(Device); int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, nullptr); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -270,8 +274,11 @@ } #endif + AsyncInfoTy AsyncInfo(Device); int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, nullptr); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -335,8 +342,11 @@ arg_names, "Updating OpenMP data"); DeviceTy &Device = PM->Devices[device_id]; + AsyncInfoTy AsyncInfo(Device); int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, nullptr); + arg_types, arg_names, arg_mappers, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -408,9 +418,12 @@ #endif DeviceTy &Device = PM->Devices[device_id]; - int rc = - target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, nullptr); + AsyncInfoTy AsyncInfo(Device); + int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, + AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); return rc; } @@ 
-490,9 +503,12 @@ #endif DeviceTy &Device = PM->Devices[device_id]; + AsyncInfoTy AsyncInfo(Device); int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, arg_names, arg_mappers, team_num, thread_limit, - true /*team*/, nullptr); + true /*team*/, AsyncInfo); + if (rc == OFFLOAD_SUCCESS) + rc = AsyncInfo.synchronize(); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); return rc; } diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -166,6 +166,7 @@ * Run ctors for static objects */ if (!Device.PendingCtorsDtors.empty()) { + AsyncInfoTy AsyncInfo(Device); // Call all ctors for all libraries registered so far for (auto &lib : Device.PendingCtorsDtors) { if (!lib.second.PendingCtors.empty()) { @@ -174,7 +175,7 @@ void *ctor = entry; int rc = target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr); + nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo); if (rc != OFFLOAD_SUCCESS) { REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); Device.PendingGlobalsMtx.unlock(); @@ -186,6 +187,9 @@ DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); } } + // All constructors have been issued, wait for them now. 
+ if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; } Device.HasPendingGlobals = false; Device.PendingGlobalsMtx.unlock(); @@ -226,6 +230,7 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg, int64_t arg_size, int64_t arg_type, map_var_info_t arg_names, void *arg_mapper, + AsyncInfoTy &AsyncInfo, TargetDataFuncPtrTy target_data_function) { TIMESCOPE_WITH_IDENT(loc); DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper)); @@ -256,11 +261,10 @@ MapperArgNames[I] = C.Name; } - int rc = target_data_function(loc, Device, MapperComponents.Components.size(), - MapperArgsBase.data(), MapperArgs.data(), - MapperArgSizes.data(), MapperArgTypes.data(), - MapperArgNames.data(), /*arg_mappers*/ nullptr, - /* AsyncInfoTy */ nullptr); + int rc = target_data_function( + loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(), + MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), + MapperArgNames.data(), /*arg_mappers*/ nullptr, AsyncInfo); return rc; } @@ -269,7 +273,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, AsyncInfoTy *AsyncInfo) { + void **arg_mappers, AsyncInfoTy &AsyncInfo) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -286,7 +290,7 @@ map_var_info_t arg_name = (!arg_names) ? 
nullptr : arg_names[i]; int rc = targetDataMapper(loc, Device, args_base[i], args[i], arg_sizes[i], arg_types[i], arg_name, - arg_mappers[i], targetDataBegin); + arg_mappers[i], AsyncInfo, targetDataBegin); if (rc != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" @@ -416,7 +420,7 @@ DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); int rt = - Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, *AsyncInfo); + Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo); if (rt != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -430,7 +434,7 @@ uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, - sizeof(void *), *AsyncInfo); + sizeof(void *), AsyncInfo); if (rt != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -470,7 +474,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy *AsyncInfo) { + void **ArgMappers, AsyncInfoTy &AsyncInfo) { int Ret; std::vector DeallocTgtPtrs; // process each input. @@ -488,9 +492,9 @@ DP("Calling targetDataMapper for the %dth argument\n", I); map_var_info_t ArgName = (!ArgNames) ? 
nullptr : ArgNames[I]; - Ret = - targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], - ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd); + Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], + ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, + targetDataEnd); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataEnd via targetDataMapper for custom mapper" @@ -585,7 +589,7 @@ DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, - *AsyncInfo); + AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data from device failed.\n"); return OFFLOAD_FAIL; @@ -637,17 +641,16 @@ } } - // We need to synchronize before deallocating data. - // If AsyncInfo is nullptr, the previous data transfer (if has) will be + // TODO: We should not synchronize here but pass the AsyncInfo object to the + // allocate/deallocate device APIs. We need to synchronize before deallocating + // data. If AsyncInfo is nullptr, the previous data transfer (if has) will be // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is // nullptr, there is no data transfer happened because once there is, // AsyncInfo->Queue will not be nullptr, so again, we don't need to // synchronize. - if (AsyncInfo) { - Ret = AsyncInfo->synchronize(); - if (Ret != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - } + Ret = AsyncInfo.synchronize(); + if (Ret != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; // Deallocate target pointer for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) { @@ -794,12 +797,10 @@ } /// Internal function to pass data to/from the target. -// AsyncInfo is currently unused, added here so targetDataUpdate has the -// same signature as targetDataBegin and targetDataEnd.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, AsyncInfoTy *AsyncInfo) { + void **ArgMappers, AsyncInfoTy &AsyncInfo) { // process each input. for (int32_t I = 0; I < ArgNum; ++I) { if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || @@ -814,7 +815,7 @@ map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], - ArgTypes[I], ArgName, ArgMappers[I], + ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, targetDataUpdate); if (Ret != OFFLOAD_SUCCESS) { @@ -950,7 +951,7 @@ /// A reference to the \p DeviceTy object DeviceTy &Device; /// A pointer to a \p AsyncInfoTy object - AsyncInfoTy *AsyncInfo; + AsyncInfoTy &AsyncInfo; // TODO: What would be the best value here? Should we make it configurable? // If the size is larger than this threshold, we will allocate and transfer it @@ -959,7 +960,7 @@ public: /// Constructor - PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy *AsyncInfo) + PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo) : Device(Dev), AsyncInfo(AsyncInfo) {} /// Add a private argument @@ -986,7 +987,7 @@ #endif // If first-private, copy data from host if (IsFirstPrivate) { - int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, *AsyncInfo); + int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Copying data to device failed, failed.\n"); return OFFLOAD_FAIL; @@ -1042,7 +1043,7 @@ FirstPrivateArgSize, DPxPTR(TgtPtr)); // Transfer data to target device int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), - FirstPrivateArgSize, *AsyncInfo); + FirstPrivateArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { DP("Failed to submit data of private arguments.\n"); return OFFLOAD_FAIL; @@ -1090,7 +1091,7 @@ std::vector &TgtArgs, std::vector &TgtOffsets, PrivateArgumentManagerTy 
&PrivateArgumentManager, - AsyncInfoTy *AsyncInfo) { + AsyncInfoTy &AsyncInfo) { TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc); DeviceTy &Device = PM->Devices[DeviceId]; int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes, @@ -1141,7 +1142,7 @@ DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, - sizeof(void *), *AsyncInfo); + sizeof(void *), AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Copying data to device failed.\n"); return OFFLOAD_FAIL; @@ -1211,7 +1212,7 @@ int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, PrivateArgumentManagerTy &PrivateArgumentManager, - AsyncInfoTy *AsyncInfo) { + AsyncInfoTy &AsyncInfo) { TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc); DeviceTy &Device = PM->Devices[DeviceId]; @@ -1243,7 +1244,7 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, - int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy *AsyncInfo) { + int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) { int32_t DeviceId = Device.DeviceID; TableMap *TM = getTableMap(HostPtr); @@ -1264,12 +1265,6 @@ } assert(TargetTable && "Global data has not been mapped\n"); - // TODO: This will go away as soon as we consequently pass in async info - // objects (as references). 
- AsyncInfoTy InternalAsyncInfo(Device); - if (!AsyncInfo) - AsyncInfo = &InternalAsyncInfo; - std::vector TgtArgs; std::vector TgtOffsets; @@ -1301,10 +1296,10 @@ if (IsTeamConstruct) Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], TgtArgs.size(), TeamNum, ThreadLimit, - LoopTripCount, *AsyncInfo); + LoopTripCount, AsyncInfo); else Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], - TgtArgs.size(), *AsyncInfo); + TgtArgs.size(), AsyncInfo); } if (Ret != OFFLOAD_SUCCESS) { @@ -1328,7 +1323,7 @@ // If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't // hava any argument, and the device supports async operations, so we need a // sync at this point. - return AsyncInfo->synchronize(); + return AsyncInfo.synchronize(); } return OFFLOAD_SUCCESS; diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -23,23 +23,23 @@ extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, AsyncInfoTy *AsyncInfo); + void **arg_mappers, AsyncInfoTy &AsyncInfo); extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *arg_names, - void **ArgMappers, AsyncInfoTy *AsyncInfo); + void **ArgMappers, AsyncInfoTy &AsyncInfo); extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, - void **arg_mappers, AsyncInfoTy *AsyncInfo); + void **arg_mappers, AsyncInfoTy &AsyncInfo); extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *arg_names, void **ArgMappers, int32_t TeamNum, 
int32_t ThreadLimit, - int IsTeamConstruct, AsyncInfoTy *AsyncInfo); + int IsTeamConstruct, AsyncInfoTy &AsyncInfo); extern int CheckDeviceAndCtors(int64_t device_id); @@ -76,7 +76,7 @@ // targetDataEnd and targetDataUpdate). typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, - map_var_info_t *, void **, AsyncInfoTy *); + map_var_info_t *, void **, AsyncInfoTy &); // Implemented in libomp, they are called from within __tgt_* functions. #ifdef __cplusplus diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -400,16 +400,20 @@ DeviceTy &Device = PM->Devices[FoundRTL->Idx + i]; Device.PendingGlobalsMtx.lock(); if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { + AsyncInfoTy AsyncInfo(Device); for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { - int rc = - target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr); + int rc = target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, 1, 1, true /*team*/, + AsyncInfo); if (rc != OFFLOAD_SUCCESS) { DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); } } // Remove this library's entry from PendingCtorsDtors Device.PendingCtorsDtors.erase(desc); + // All destructors have been issued, wait for them now. + if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + DP("Failed synchronizing destructors kernels.\n"); } Device.PendingGlobalsMtx.unlock(); }