diff --git a/openmp/libomptarget/include/SourceInfo.h b/openmp/libomptarget/include/SourceInfo.h --- a/openmp/libomptarget/include/SourceInfo.h +++ b/openmp/libomptarget/include/SourceInfo.h @@ -91,6 +91,7 @@ const char *getName() const { return Name.c_str(); } const char *getFilename() const { return Filename.c_str(); } + const char *getProfileLocation() const { return SourceStr.data(); } int32_t getLine() const { return Line; } int32_t getColumn() const { return Column; } bool isAvailible() const { return (Line || Column); } diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -132,7 +132,7 @@ int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (IsOffloadDisabled()) return; DP("Entering data begin region for device %" PRId64 " with %d mappings\n", @@ -164,7 +164,7 @@ } #endif - int rc = targetDataBegin(Device, arg_num, args_base, args, arg_sizes, + int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes, arg_types, arg_names, arg_mappers, nullptr); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -174,7 +174,7 @@ void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -210,7 +210,7 @@ int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (IsOffloadDisabled()) return; DP("Entering data end region with %d mappings\n", arg_num); @@ -247,8 +247,8 @@ } #endif - int rc = targetDataEnd(Device, arg_num, args_base, args, arg_sizes, arg_types, - arg_names, arg_mappers, nullptr); + int rc = targetDataEnd(loc, Device, arg_num, args_base, args, 
arg_sizes, + arg_types, arg_names, arg_mappers, nullptr); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -257,7 +257,7 @@ void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -290,7 +290,7 @@ int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (IsOffloadDisabled()) return; DP("Entering data update with %d mappings\n", arg_num); @@ -310,7 +310,7 @@ arg_names, "Updating OpenMP data"); DeviceTy &Device = PM->Devices[device_id]; - int rc = targetDataUpdate(Device, arg_num, args_base, args, arg_sizes, + int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes, arg_types, arg_names, arg_mappers); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); } @@ -320,7 +320,7 @@ void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -351,7 +351,7 @@ int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (IsOffloadDisabled()) return OFFLOAD_FAIL; DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 "\n", DPxPTR(host_ptr), device_id); @@ -378,7 +378,7 @@ } #endif - int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + int rc = target(loc, device_id, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, arg_names, arg_mappers, 0, 0, false /*team*/); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); return rc; @@ 
-389,7 +389,7 @@ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -426,7 +426,7 @@ map_var_info_t *arg_names, void **arg_mappers, int32_t team_num, int32_t thread_limit) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (IsOffloadDisabled()) return OFFLOAD_FAIL; DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 "\n", DPxPTR(host_ptr), device_id); @@ -453,7 +453,7 @@ } #endif - int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + int rc = target(loc, device_id, host_ptr, arg_num, args_base, args, arg_sizes, arg_types, arg_names, arg_mappers, team_num, thread_limit, true /*team*/); HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); @@ -466,7 +466,7 @@ map_var_info_t *arg_names, void **arg_mappers, int32_t team_num, int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (depNum + noAliasDepNum > 0) __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc)); @@ -502,7 +502,7 @@ EXTERN void __kmpc_push_target_tripcount(ident_t *loc, int64_t device_id, uint64_t loop_tripcount) { - TIMESCOPE(); + TIMESCOPE_WITH_IDENT(loc); if (IsOffloadDisabled()) return; diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -51,7 +51,7 @@ static const int64_t Alignment = 8; /// Map global data and execute pending ctors -static int InitLibrary(DeviceTy& Device) { +static int InitLibrary(DeviceTy &Device) { /* * Map global data */ @@ -84,8 +84,8 @@ break; } // 2) load image into the target table. 
- __tgt_target_table *TargetTable = - TransTable->TargetsTable[device_id] = Device.load_binary(img); + __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] = + Device.load_binary(img); // Unable to get table for this image: invalidate image and fail. if (!TargetTable) { REPORT("Unable to generate entries table for device id %d.\n", device_id); @@ -129,8 +129,9 @@ if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) continue; DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" - "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), - CurrDeviceEntry->size); + "\n", + DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), + CurrDeviceEntry->size); Device.HostDataToTargetMap.emplace( (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, @@ -158,8 +159,9 @@ DP("Has pending ctors... call now\n"); for (auto &entry : lib.second.PendingCtors) { void *ctor = entry; - int rc = target(device_id, ctor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/); + int rc = + target(nullptr, device_id, ctor, 0, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, 1, 1, true /*team*/); if (rc != OFFLOAD_SUCCESS) { REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); Device.PendingGlobalsMtx.unlock(); @@ -208,10 +210,11 @@ /// Call the user-defined mapper function followed by the appropriate // target_data_* function (target_data_{begin,end,update}). -int targetDataMapper(DeviceTy &Device, void *arg_base, void *arg, +int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg, int64_t arg_size, int64_t arg_type, map_var_info_t arg_names, void *arg_mapper, TargetDataFuncPtrTy target_data_function) { + TIMESCOPE_WITH_IDENT(loc); DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper)); // The mapper function fills up Components. 
@@ -240,7 +243,7 @@ MapperArgNames[I] = C.Name; } - int rc = target_data_function(Device, MapperComponents.Components.size(), + int rc = target_data_function(loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(), MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), MapperArgNames.data(), /*arg_mappers*/ nullptr, @@ -250,10 +253,10 @@ } /// Internal function to do the mapping and transfer the data to the device -int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - map_var_info_t *arg_names, void **arg_mappers, - __tgt_async_info *async_info_ptr) { +int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, map_var_info_t *arg_names, + void **arg_mappers, __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -268,7 +271,7 @@ DP("Calling targetDataMapper for the %dth argument\n", i); map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i]; - int rc = targetDataMapper(Device, args_base[i], args[i], arg_sizes[i], + int rc = targetDataMapper(loc, Device, args_base[i], args[i], arg_sizes[i], arg_types[i], arg_name, arg_mappers[i], targetDataBegin); @@ -291,14 +294,15 @@ // Look at the next argument - if that is MEMBER_OF this one, then this one // is a combined entry. 
int64_t padding = 0; - const int next_i = i+1; + const int next_i = i + 1; if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num && getParentIndex(arg_types[next_i]) == i) { padding = (int64_t)HstPtrBegin % Alignment; if (padding) { DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; + "\n", + padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *)HstPtrBegin - padding; data_size += padding; } } @@ -344,8 +348,9 @@ return OFFLOAD_FAIL; } DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" - "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin), - (Pointer_IsNew ? "" : " not")); + "\n", + sizeof(void *), DPxPTR(PointerTgtPtrBegin), + (Pointer_IsNew ? "" : " not")); Pointer_HstPtrBegin = HstPtrBase; // modify current entry. HstPtrBase = *(void **)HstPtrBase; @@ -364,8 +369,8 @@ return OFFLOAD_FAIL; } DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), - (IsNew ? "" : " not")); + " - is%s new\n", + data_size, DPxPTR(TgtPtrBegin), (IsNew ? "" : " not")); if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; @@ -449,10 +454,10 @@ } // namespace /// Internal function to undo the mapping and retrieve the data from the device. -int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases, - void **Args, int64_t *ArgSizes, int64_t *ArgTypes, - map_var_info_t *ArgNames, void **ArgMappers, - __tgt_async_info *AsyncInfo) { +int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, + void **ArgBases, void **Args, int64_t *ArgSizes, + int64_t *ArgTypes, map_var_info_t *ArgNames, + void **ArgMappers, __tgt_async_info *AsyncInfo) { int Ret; std::vector DeallocTgtPtrs; // process each input. @@ -471,7 +476,7 @@ map_var_info_t ArgName = (!ArgNames) ? 
nullptr : ArgNames[I]; Ret = - targetDataMapper(Device, ArgBases[I], Args[I], ArgSizes[I], + targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd); if (Ret != OFFLOAD_SUCCESS) { @@ -646,9 +651,10 @@ return OFFLOAD_SUCCESS; } -static int targetDataContiguous(DeviceTy &Device, void *ArgsBase, +static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase, void *HstPtrBegin, int64_t ArgSize, int64_t ArgType) { + TIMESCOPE_WITH_IDENT(loc); bool IsLast, IsHostPtr; void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false, IsHostPtr, /*MustContain=*/true); @@ -732,11 +738,13 @@ return OFFLOAD_SUCCESS; } -static int targetDataNonContiguous(DeviceTy &Device, void *ArgsBase, +static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device, + void *ArgsBase, __tgt_target_non_contig *NonContig, uint64_t Size, int64_t ArgType, int CurrentDim, int DimSize, uint64_t Offset) { + TIMESCOPE_WITH_IDENT(loc); int Ret = OFFLOAD_SUCCESS; if (CurrentDim < DimSize) { for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) { @@ -745,7 +753,7 @@ // we only need to transfer the first element for the last dimension // since we've already got a contiguous piece. if (CurrentDim != DimSize - 1 || I == 0) { - Ret = targetDataNonContiguous(Device, ArgsBase, NonContig, Size, + Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size, ArgType, CurrentDim + 1, DimSize, Offset + CurOffset); // Stop the whole process if any contiguous piece returns anything @@ -758,7 +766,7 @@ char *Ptr = (char *)ArgsBase + Offset; DP("Transfer of non-contiguous : host ptr %lx offset %ld len %ld\n", (uint64_t)Ptr, Offset, Size); - Ret = targetDataContiguous(Device, ArgsBase, Ptr, Size, ArgType); + Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType); } return Ret; } @@ -776,10 +784,10 @@ /// Internal function to pass data to/from the target. 
// async_info_ptr is currently unused, added here so targetDataUpdate has the // same signature as targetDataBegin and targetDataEnd. -int targetDataUpdate(DeviceTy &Device, int32_t ArgNum, void **ArgsBase, - void **Args, int64_t *ArgSizes, int64_t *ArgTypes, - map_var_info_t *ArgNames, void **ArgMappers, - __tgt_async_info *AsyncInfoPtr) { +int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum, + void **ArgsBase, void **Args, int64_t *ArgSizes, + int64_t *ArgTypes, map_var_info_t *ArgNames, + void **ArgMappers, __tgt_async_info *AsyncInfoPtr) { // process each input. for (int32_t I = 0; I < ArgNum; ++I) { if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || @@ -793,7 +801,7 @@ DP("Calling targetDataMapper for the %dth argument\n", I); map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; - int Ret = targetDataMapper(Device, ArgsBase[I], Args[I], ArgSizes[I], + int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], ArgTypes[I], ArgName, ArgMappers[I], targetDataUpdate); @@ -816,10 +824,10 @@ NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride; int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); Ret = targetDataNonContiguous( - Device, ArgsBase[I], NonContig, Size, ArgTypes[I], + loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I], /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0); } else { - Ret = targetDataContiguous(Device, ArgsBase[I], Args[I], ArgSizes[I], + Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], ArgTypes[I]); } if (Ret == OFFLOAD_FAIL) @@ -1063,16 +1071,18 @@ /// Process data before launching the kernel, including calling targetDataBegin /// to map and transfer data to target device, transferring (first-)private /// variables. 
-int processDataBefore(int64_t DeviceId, void *HostPtr, int32_t ArgNum, - void **ArgBases, void **Args, int64_t *ArgSizes, - int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, std::vector &TgtArgs, - std::vector &TgtOffsets, - PrivateArgumentManagerTy &PrivateArgumentManager, - __tgt_async_info *AsyncInfo) { +static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr, + int32_t ArgNum, void **ArgBases, void **Args, + int64_t *ArgSizes, int64_t *ArgTypes, + map_var_info_t *ArgNames, void **ArgMappers, + std::vector &TgtArgs, + std::vector &TgtOffsets, + PrivateArgumentManagerTy &PrivateArgumentManager, + __tgt_async_info *AsyncInfo) { + TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc); DeviceTy &Device = PM->Devices[DeviceId]; - int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, - ArgNames, ArgMappers, AsyncInfo); + int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes, + ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataBegin failed, abort target.\n"); return OFFLOAD_FAIL; @@ -1184,17 +1194,18 @@ /// Process data after launching the kernel, including transferring data back to /// host if needed and deallocating target memory of (first-)private variables. 
-int processDataAfter(int64_t DeviceId, void *HostPtr, int32_t ArgNum, - void **ArgBases, void **Args, int64_t *ArgSizes, - int64_t *ArgTypes, map_var_info_t *ArgNames, - void **ArgMappers, - PrivateArgumentManagerTy &PrivateArgumentManager, - __tgt_async_info *AsyncInfo) { +static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr, + int32_t ArgNum, void **ArgBases, void **Args, + int64_t *ArgSizes, int64_t *ArgTypes, + map_var_info_t *ArgNames, void **ArgMappers, + PrivateArgumentManagerTy &PrivateArgumentManager, + __tgt_async_info *AsyncInfo) { + TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc); DeviceTy &Device = PM->Devices[DeviceId]; // Move data from device. - int Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, - ArgNames, ArgMappers, AsyncInfo); + int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes, + ArgTypes, ArgNames, ArgMappers, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { REPORT("Call to targetDataEnd failed, abort target.\n"); return OFFLOAD_FAIL; @@ -1217,8 +1228,8 @@ /// performs the same action as data_update and data_end above. This function /// returns 0 if it was able to transfer the execution to a target and an /// integer different from zero otherwise. 
-int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases, - void **Args, int64_t *ArgSizes, int64_t *ArgTypes, +int target(ident_t *loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum, + void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit, int IsTeamConstruct) { DeviceTy &Device = PM->Devices[DeviceId]; @@ -1248,13 +1259,16 @@ PrivateArgumentManagerTy PrivateArgumentManager(Device, &AsyncInfo); - // Process data, such as data mapping, before launching the kernel - int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args, - ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs, - TgtOffsets, PrivateArgumentManager, &AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Failed to process data before launching the kernel.\n"); - return OFFLOAD_FAIL; + int Ret; + if (ArgNum) { + // Process data, such as data mapping, before launching the kernel + Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args, + ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs, + TgtOffsets, PrivateArgumentManager, &AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to process data before launching the kernel.\n"); + return OFFLOAD_FAIL; + } } // Get loop trip count @@ -1265,27 +1279,33 @@ DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index); - if (IsTeamConstruct) - Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], - TgtArgs.size(), TeamNum, ThreadLimit, - LoopTripCount, &AsyncInfo); - else - Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], - TgtArgs.size(), &AsyncInfo); + { + TIMESCOPE_WITH_NAME_AND_IDENT( + IsTeamConstruct ? 
"runTargetTeamRegion" : "runTargetRegion", loc); + if (IsTeamConstruct) + Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], + TgtArgs.size(), TeamNum, ThreadLimit, + LoopTripCount, &AsyncInfo); + else + Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], + TgtArgs.size(), &AsyncInfo); + } if (Ret != OFFLOAD_SUCCESS) { REPORT("Executing target region abort target.\n"); return OFFLOAD_FAIL; } - // Transfer data back and deallocate target memory for (first-)private - // variables - Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, PrivateArgumentManager, - &AsyncInfo); - if (Ret != OFFLOAD_SUCCESS) { - REPORT("Failed to process data after launching the kernel.\n"); - return OFFLOAD_FAIL; + if (ArgNum) { + // Transfer data back and deallocate target memory for (first-)private + // variables + Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args, + ArgSizes, ArgTypes, ArgNames, ArgMappers, + PrivateArgumentManager, &AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + REPORT("Failed to process data after launching the kernel.\n"); + return OFFLOAD_FAIL; + } } return OFFLOAD_SUCCESS; diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -19,22 +19,24 @@ #include -extern int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - map_var_info_t *arg_names, void **arg_mappers, +extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, map_var_info_t *arg_names, + void **arg_mappers, __tgt_async_info *async_info_ptr); -extern int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases, - void **Args, int64_t *ArgSizes, int64_t *ArgTypes, - map_var_info_t *arg_names, void **ArgMappers, - 
__tgt_async_info *AsyncInfo); +extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, + void **ArgBases, void **Args, int64_t *ArgSizes, + int64_t *ArgTypes, map_var_info_t *arg_names, + void **ArgMappers, __tgt_async_info *AsyncInfo); -extern int targetDataUpdate(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - map_var_info_t *arg_names, void **arg_mappers, +extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, map_var_info_t *arg_names, + void **arg_mappers, __tgt_async_info *async_info_ptr = nullptr); -extern int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, +extern int target(ident_t *loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *arg_names, void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit, @@ -73,9 +75,10 @@ // Function pointer type for target_data_* functions (targetDataBegin, // targetDataEnd and targetDataUpdate). -typedef int (*TargetDataFuncPtrTy)(DeviceTy &, int32_t, void **, void **, - int64_t *, int64_t *, map_var_info_t *, - void **, __tgt_async_info *); +typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **, + void **, int64_t *, int64_t *, + map_var_info_t *, void **, + __tgt_async_info *); // Implemented in libomp, they are called from within __tgt_* functions. 
#ifdef __cplusplus @@ -157,8 +160,16 @@ #ifdef OMPTARGET_PROFILE_ENABLED #include "llvm/Support/TimeProfiler.h" #define TIMESCOPE() llvm::TimeTraceScope TimeScope(__FUNCTION__) +#define TIMESCOPE_WITH_IDENT(IDENT) \ + SourceInfo SI(IDENT); \ + llvm::TimeTraceScope TimeScope(__FUNCTION__, SI.getProfileLocation()) +#define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT) \ + SourceInfo SI(IDENT); \ + llvm::TimeTraceScope TimeScope(NAME, SI.getProfileLocation()) #else #define TIMESCOPE() +#define TIMESCOPE_WITH_IDENT(IDENT) +#define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT) #endif #endif diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -396,8 +396,9 @@ Device.PendingGlobalsMtx.lock(); if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { - int rc = target(Device.DeviceID, dtor, 0, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, 1, 1, true /*team*/); + int rc = + target(nullptr, Device.DeviceID, dtor, 0, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, 1, 1, true /*team*/); if (rc != OFFLOAD_SUCCESS) { DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); }