diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -117,7 +117,8 @@
   int64_t *ArgSizes;  // Size of the argument data in bytes.
   int64_t *ArgTypes;  // Type of the data (e.g. to / from).
   void **ArgNames;    // Name of the data for debugging, possibly null.
-  void **ArgMappers;  // User-defined mappers, possible null.
+  void **ArgMappers;  // User-defined mappers, possibly null.
+  int64_t Tripcount;  // Tripcount for the teams / distribute loop, 0 otherwise.
 };
 
 /// This struct is a record of an entry point or global. For a function
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -280,7 +280,7 @@
                         map_var_info_t *arg_names, void **arg_mappers) {
   TIMESCOPE_WITH_IDENT(loc);
   __tgt_kernel_arguments Args{arg_num, args_base, args, arg_sizes,
-                              arg_types, arg_names, arg_mappers};
+                              arg_types, arg_names, arg_mappers};
   return __tgt_target_kernel(loc, device_id, -1, -1, host_ptr, &Args);
 }
 
@@ -329,7 +329,7 @@
   TIMESCOPE_WITH_IDENT(loc);
 
   __tgt_kernel_arguments Args{arg_num, args_base, args, arg_sizes,
-                              arg_types, arg_names, arg_mappers};
+                              arg_types, arg_names, arg_mappers};
   return __tgt_target_kernel(loc, device_id, team_num, thread_limit, host_ptr,
                              &Args);
 }
@@ -380,11 +380,12 @@
     ThreadLimit = 0;
   }
 
   DeviceTy &Device = *PM->Devices[DeviceId];
   AsyncInfoTy AsyncInfo(Device);
   int rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs,
                   Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames,
-                  Args->ArgMappers, NumTeams, ThreadLimit, IsTeams, AsyncInfo);
+                  Args->ArgMappers, NumTeams, ThreadLimit, Args->Tripcount,
+                  IsTeams, AsyncInfo);
   if (rc == OFFLOAD_SUCCESS)
     rc = AsyncInfo.synchronize();
   handleTargetOutcome(rc == OFFLOAD_SUCCESS, Loc);
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -190,9 +190,9 @@
         DP("Has pending ctors... call now\n");
         for (auto &entry : lib.second.PendingCtors) {
           void *ctor = entry;
-          int rc =
-              target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
-                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
+          int rc = target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
+                          nullptr, nullptr, nullptr, 1, 1, 0, true /*team*/,
+                          AsyncInfo);
           if (rc != OFFLOAD_SUCCESS) {
             REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
             return OFFLOAD_FAIL;
@@ -1486,10 +1486,11 @@
 /// performs the same action as data_update and data_end above. This function
 /// returns 0 if it was able to transfer the execution to a target and an
 /// integer different from zero otherwise.
-int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
+int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
            void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
-           int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
+           int32_t ThreadLimit, uint64_t Tripcount, int IsTeamConstruct,
+           AsyncInfoTy &AsyncInfo) {
   int32_t DeviceId = Device.DeviceID;
 
   TableMap *TM = getTableMap(HostPtr);
@@ -1510,6 +1511,12 @@
   }
   assert(TargetTable && "Global data has not been mapped\n");
 
+  // Record the tripcount, keyed by global thread id, under TblMapMtx.
+  {
+    std::lock_guard TblMapLock(PM->TblMapMtx);
+    Device.LoopTripCnt.emplace(__kmpc_global_thread_num(Loc), Tripcount);
+  }
+
   // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
   // need to manifest base pointers prior to launching a kernel. Even if we have
   // mapped an object only partially, e.g. A[N:M], although the kernel is
@@ -1527,7 +1534,7 @@
   int Ret;
   if (ArgNum) {
     // Process data, such as data mapping, before launching the kernel
-    Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
+    Ret = processDataBefore(Loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                             ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                             TgtOffsets, PrivateArgumentManager, AsyncInfo);
     if (Ret != OFFLOAD_SUCCESS) {
@@ -1543,7 +1550,7 @@
 
   {
     TIMESCOPE_WITH_NAME_AND_IDENT(
-        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
+        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", Loc);
     if (IsTeamConstruct)
       Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                                  TgtArgs.size(), TeamNum, ThreadLimit,
@@ -1561,7 +1568,7 @@
   if (ArgNum) {
     // Transfer data back and deallocate target memory for (first-)private
     // variables
-    Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
+    Ret = processDataAfter(Loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers,
                            PrivateArgumentManager, AsyncInfo);
     if (Ret != OFFLOAD_SUCCESS) {
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -38,11 +38,12 @@
                       void **arg_mappers, AsyncInfoTy &AsyncInfo,
                       bool FromMapper = false);
 
-extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
+extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *arg_names,
                   void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit,
-                  int IsTeamConstruct, AsyncInfoTy &AsyncInfo);
+                  uint64_t Tripcount, int IsTeamConstruct,
+                  AsyncInfoTy &AsyncInfo);
 
 extern void handleTargetOutcome(bool Success, ident_t *Loc);
 extern bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc);
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -447,7 +447,7 @@
     AsyncInfoTy AsyncInfo(Device);
     for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
       int rc = target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
-                      nullptr, nullptr, nullptr, 1, 1, true /*team*/,
+                      nullptr, nullptr, nullptr, 1, 1, 0, true /*team*/,
                       AsyncInfo);
       if (rc != OFFLOAD_SUCCESS) {
         DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));