diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -153,7 +153,7 @@ return lr; } -// Used by target_data_begin +// Used by targetDataBegin // Return the target pointer begin (where the data will be moved). // Allocate memory if this is the first occurrence of this mapping. // Increment the reference counter. @@ -232,9 +232,9 @@ return rc; } -// Used by target_data_begin, target_data_end, target_data_update and target. +// Used by targetDataBegin, targetDataEnd, target_data_update and target. // Return the target pointer begin (where the data will be moved). -// Decrement the reference counter if called from target_data_end. +// Decrement the reference counter if called from targetDataEnd. void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, bool UpdateRefCount, bool &IsHostPtr) { void *rc = NULL; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -136,8 +136,8 @@ } #endif - int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_mappers, nullptr); + int rc = targetDataBegin(Device, arg_num, args_base, args, arg_sizes, + arg_types, arg_mappers, nullptr); HandleTargetOutcome(rc == OFFLOAD_SUCCESS); } @@ -207,8 +207,8 @@ } #endif - int rc = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_mappers, nullptr); + int rc = targetDataEnd(Device, arg_num, args_base, args, arg_sizes, arg_types, + arg_mappers, nullptr); HandleTargetOutcome(rc == OFFLOAD_SUCCESS); } diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -251,9 +251,9 @@ } /// Internal function to do the mapping and transfer the data to the device -int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers, __tgt_async_info *async_info_ptr) { +int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = 0; i < arg_num; ++i) { // Ignore private variables and arrays - there is no mapping for them. @@ -262,17 +262,18 @@ continue; if (arg_mappers && arg_mappers[i]) { - // Instead of executing the regular path of target_data_begin, call the - // target_data_mapper variant which will call target_data_begin again + // Instead of executing the regular path of targetDataBegin, call the + // target_data_mapper variant which will call targetDataBegin again // with new arguments. DP("Calling target_data_mapper for the %dth argument\n", i); - int rc = target_data_mapper(Device, args_base[i], args[i], arg_sizes[i], - arg_types[i], arg_mappers[i], target_data_begin); + int rc = + target_data_mapper(Device, args_base[i], args[i], arg_sizes[i], + arg_types[i], arg_mappers[i], targetDataBegin); if (rc != OFFLOAD_SUCCESS) { - DP("Call to target_data_begin via target_data_mapper for custom mapper" - " failed.\n"); + DP("Call to targetDataBegin via target_data_mapper for custom mapper" + " failed.\n"); return OFFLOAD_FAIL; } @@ -422,9 +423,9 @@ } /// Internal function to undo the mapping and retrieve the data from the device. -int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers, __tgt_async_info *async_info_ptr) { +int targetDataEnd(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, __tgt_async_info *async_info_ptr) { // process each input. for (int32_t i = arg_num - 1; i >= 0; --i) { // Ignore private variables and arrays - there is no mapping for them. @@ -434,17 +435,17 @@ continue; if (arg_mappers && arg_mappers[i]) { - // Instead of executing the regular path of target_data_end, call the - // target_data_mapper variant which will call target_data_end again + // Instead of executing the regular path of targetDataEnd, call the + // target_data_mapper variant which will call targetDataEnd again // with new arguments. DP("Calling target_data_mapper for the %dth argument\n", i); int rc = target_data_mapper(Device, args_base[i], args[i], arg_sizes[i], - arg_types[i], arg_mappers[i], target_data_end); + arg_types[i], arg_mappers[i], targetDataEnd); if (rc != OFFLOAD_SUCCESS) { - DP("Call to target_data_end via target_data_mapper for custom mapper" - " failed.\n"); + DP("Call to targetDataEnd via target_data_mapper for custom mapper" + " failed.\n"); return OFFLOAD_FAIL; } @@ -591,7 +592,7 @@ /// Internal function to pass data to/from the target. // async_info_ptr is currently unused, added here so target_data_update has the -// same signature as target_data_begin and target_data_end. +// same signature as targetDataBegin and targetDataEnd. int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, __tgt_async_info *async_info_ptr) { @@ -707,111 +708,101 @@ return (Mapping & LambdaMapping) == LambdaMapping; } -/// performs the same actions as data_begin in case arg_num is -/// non-zero and initiates run of the offloaded region on the target platform; -/// if arg_num is non-zero after the region execution is done it also -/// performs the same action as data_update and data_end above. This function -/// returns 0 if it was able to transfer the execution to a target and an -/// integer different from zero otherwise. -int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - void **arg_mappers, int32_t team_num, int32_t thread_limit, - int IsTeamConstruct) { - DeviceTy &Device = Devices[device_id]; - - // Find the table information in the map or look it up in the translation - // tables. - TableMap *TM = 0; - TblMapMtx->lock(); - HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(host_ptr); - if (TableMapIt == HostPtrToTableMap->end()) { - // We don't have a map. So search all the registered libraries. - TrlTblMtx->lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable->begin(), - ie = HostEntriesBeginToTransTable->end(); - !TM && ii != ie; ++ii) { - // get the translation table (which contains all the good info). - TranslationTable *TransTable = &ii->second; - // iterate over all the host table entries to see if we can locate the - // host_ptr. - __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; - __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; - __tgt_offload_entry *cur = begin; - for (uint32_t i = 0; cur < end; ++cur, ++i) { - if (cur->addr != host_ptr) - continue; - // we got a match, now fill the HostPtrToTableMap so that we - // may avoid this search next time. - TM = &(*HostPtrToTableMap)[host_ptr]; - TM->Table = TransTable; - TM->Index = i; - break; - } +namespace { +TableMap *findTableMap(void *HostPtr) { + std::lock_guard TblMapLock(*TblMapMtx); + HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(HostPtr); + if (TableMapIt != HostPtrToTableMap->end()) + return &TableMapIt->second; + + // We don't have a map. So search all the registered libraries. + std::lock_guard TrlTblLock(*TrlTblMtx); + for (HostEntriesBeginToTransTableTy::iterator Itr = + HostEntriesBeginToTransTable->begin(); + Itr != HostEntriesBeginToTransTable->end(); ++Itr) { + // Get the translation table (which contains all the good info). + TranslationTable *TransTable = &Itr->second; + // Iterate over all the host table entries to see if we can locate the + // HostPtr. + __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin; + for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) { + if (Cur->addr != HostPtr) + continue; + // We got a match. Now fill the HostPtrToTableMap so that we may avoid + // this search next time. + TableMap *TM = &(*HostPtrToTableMap)[HostPtr]; + TM->Table = TransTable; + TM->Index = I; + return TM; } - TrlTblMtx->unlock(); - } else { - TM = &TableMapIt->second; - } - TblMapMtx->unlock(); - - // No map for this host pointer found! - if (!TM) { - DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", - DPxPTR(host_ptr)); - return OFFLOAD_FAIL; } - // get target table. - TrlTblMtx->lock(); - assert(TM->Table->TargetsTable.size() > (size_t)device_id && - "Not expecting a device ID outside the table's bounds!"); - __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; - TrlTblMtx->unlock(); - assert(TargetTable && "Global data has not been mapped\n"); + return nullptr; +} - __tgt_async_info AsyncInfo; +// FIXME: This function will not work properly when calling +// __kmpc_push_target_tripcount in one thread but fetching loop trip count in +// another thread. This will happen if we introduce task yield in the future for +// better performance. +inline uint64_t getLoopTripCount(int64_t DeviceId) { + DeviceTy &Device = Devices[DeviceId]; + std::lock_guard TblMapLock(*TblMapMtx); + std::map::iterator I = + Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); + if (I != Device.LoopTripCnt.end()) { + uint64_t ltc = 0; + ltc = I->second; + Device.LoopTripCnt.erase(I); + DP("loop trip count is %lu.\n", ltc); + return ltc; + } + return 0; +} - // Move data to device. - int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_mappers, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) { - DP("Call to target_data_begin failed, abort target.\n"); +// Pre-launch data processing, mainly mapping data to the device side. +int preDataProcess(int64_t DeviceId, int32_t ArgNum, void **ArgsBase, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, + void **ArgMappers, std::vector &TgtArgs, + std::vector &TgtOffsets, + std::vector &FPArrays, __tgt_async_info *AsyncInfo) { + DeviceTy &Device = Devices[DeviceId]; + // Invoke targetDataBegin to copy non-literal and non-private data to target + // device + int Ret = targetDataBegin(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, + ArgMappers, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Call to targetDataBegin failed, abort target.\n"); return OFFLOAD_FAIL; } - std::vector tgt_args; - std::vector tgt_offsets; - // List of (first-)private arrays allocated for this target region - std::vector fpArrays; - std::vector tgtArgsPositions(arg_num, -1); + std::vector TgtArgsPositions(ArgNum, -1); - for (int32_t i = 0; i < arg_num; ++i) { - if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { - // This is not a target parameter, do not push it into tgt_args. + for (int32_t I = 0; I < ArgNum; ++I) { + if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { + // This is not a target parameter, do not push it into TgtArgs. // Check for lambda mapping. - if (isLambdaMapping(arg_types[i])) { - assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + if (isLambdaMapping(ArgTypes[I])) { + assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && "PTR_AND_OBJ must be also MEMBER_OF."); - unsigned idx = member_of(arg_types[i]); - int tgtIdx = tgtArgsPositions[idx]; - assert(tgtIdx != -1 && "Base address must be translated already."); + unsigned Idx = member_of(ArgTypes[I]); + int TgtIdx = TgtArgsPositions[Idx]; + assert(TgtIdx != -1 && "Base address must be translated already."); // The parent lambda must be processed already and it must be the last - // in tgt_args and tgt_offsets arrays. - void *HstPtrVal = args[i]; - void *HstPtrBegin = args_base[i]; - void *HstPtrBase = args[idx]; + // in TgtArgs and TgtOffsets arrays. + void *HstPtrVal = Args[I]; + void *HstPtrBegin = ArgsBase[I]; + void *HstPtrBase = Args[Idx]; bool IsLast, IsHostPtr; // unused. void *TgtPtrBase = - (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); + (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]); DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); - void *Pointer_TgtPtrBegin = - Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false, - IsHostPtr); - if (!Pointer_TgtPtrBegin) { + void *PointerTgtPtrBegin = + Device.getTgtPtrBegin(HstPtrVal, ArgSizes[I], IsLast, + /* UpdateRefCount */ false, IsHostPtr); + if (!PointerTgtPtrBegin) { DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", DPxPTR(HstPtrVal)); continue; @@ -819,127 +810,193 @@ if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && TgtPtrBegin == HstPtrBegin) { DP("Unified memory is active, no need to map lambda captured" - "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); + "variable (" DPxMOD ")\n", + DPxPTR(HstPtrVal)); continue; } DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, - sizeof(void *), &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { + DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); + Ret = Device.data_submit(TgtPtrBegin, &PointerTgtPtrBegin, + sizeof(void *), AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); return OFFLOAD_FAIL; } } continue; } - void *HstPtrBegin = args[i]; - void *HstPtrBase = args_base[i]; + void *HstPtrBegin = Args[I]; + void *HstPtrBase = ArgsBase[I]; void *TgtPtrBegin; ptrdiff_t TgtBaseOffset; bool IsLast, IsHostPtr; // unused. - if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { + if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) { DP("Forwarding first-private value " DPxMOD " to the target construct\n", - DPxPTR(HstPtrBase)); + DPxPTR(HstPtrBase)); TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; - } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { + } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { // Allocate memory for (first-)private array - TgtPtrBegin = Device.data_alloc(arg_sizes[i], HstPtrBegin); + TgtPtrBegin = Device.data_alloc(ArgSizes[I], HstPtrBegin); if (!TgtPtrBegin) { - DP ("Data allocation for %sprivate array " DPxMOD " failed, " - "abort target.\n", - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin)); + DP("Data allocation for %sprivate array " DPxMOD " failed, " + "abort target.\n", + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin)); return OFFLOAD_FAIL; } - fpArrays.push_back(TgtPtrBegin); + FPArrays.push_back(TgtPtrBegin); TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " - "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); + "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", + ArgSizes[I], DPxPTR(TgtPtrBegin), + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); #endif // If first-private, copy data from host - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], - &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { + if (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) { + Ret = Device.data_submit(TgtPtrBegin, HstPtrBegin, ArgSizes[I], + AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { DP("Copying data to device failed, failed.\n"); return OFFLOAD_FAIL; } } - } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, - false, IsHostPtr); + false, IsHostPtr); TgtBaseOffset = 0; // no offset for ptrs. DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " - "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), - DPxPTR(HstPtrBase)); + "object " DPxMOD "\n", + DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), DPxPTR(HstPtrBase)); } else { - TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, - false, IsHostPtr); + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast, + false, IsHostPtr); TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; #ifdef OMPTARGET_DEBUG void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", - DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); #endif } - tgtArgsPositions[i] = tgt_args.size(); - tgt_args.push_back(TgtPtrBegin); - tgt_offsets.push_back(TgtBaseOffset); + TgtArgsPositions[I] = TgtArgs.size(); + TgtArgs.push_back(TgtPtrBegin); + TgtOffsets.push_back(TgtBaseOffset); } - assert(tgt_args.size() == tgt_offsets.size() && - "Size mismatch in arguments and offsets"); + assert(TgtArgs.size() == TgtOffsets.size() && + "Size mismatch in arguments and offsets"); - // Pop loop trip count - uint64_t ltc = 0; - TblMapMtx->lock(); - auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); - if (I != Device.LoopTripCnt.end()) { - ltc = I->second; - Device.LoopTripCnt.erase(I); - DP("loop trip count is %lu.\n", ltc); - } - TblMapMtx->unlock(); + return OFFLOAD_SUCCESS; +} - // Launch device execution. - DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", - TargetTable->EntriesBegin[TM->Index].name, - DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); - if (IsTeamConstruct) { - rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), - team_num, thread_limit, ltc, &AsyncInfo); - } else { - rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), - &AsyncInfo); - } - if (rc != OFFLOAD_SUCCESS) { - DP ("Executing target region abort target.\n"); +// Post-launch data processing, including: +// - Deallocate all private variables we create; +// - Move data from device back to host if needed +// FIXME: This function contains historic correctness issue, which is that +// target data might be deallocated before the kernel is started. Even if the +// kernel is started, the invocation of free memory will affect performance. +int postDataProcess(int64_t DeviceId, int32_t ArgNum, void **ArgsBase, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, + void **ArgMappers, std::vector &FPArrays, + __tgt_async_info *AsyncInfo) { + DeviceTy &Device = Devices[DeviceId]; + + // Move data from device. + int Ret = targetDataEnd(Device, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, + ArgMappers, AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Call to targetDataEnd failed, abort targe.\n"); return OFFLOAD_FAIL; } // Deallocate (first-)private arrays - for (auto it : fpArrays) { - int rt = Device.data_delete(it); - if (rt != OFFLOAD_SUCCESS) { + for (void *TgtPtr : FPArrays) { + Ret = Device.data_delete(TgtPtr); + if (Ret != OFFLOAD_SUCCESS) { DP("Deallocation of (first-)private arrays failed.\n"); return OFFLOAD_FAIL; } } - // Move data from device. - int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types, arg_mappers, &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { - DP("Call to target_data_end failed, abort targe.\n"); + return OFFLOAD_SUCCESS; +} +} // namespace + +/// Performs the same actions as data_begin in case ArgNum is +/// non-zero and initiates run of the offloaded region on the target platform; +/// if ArgNum is non-zero after the region execution is done it also +/// performs the same action as data_update and data_end above. This function +/// returns 0 if it was able to transfer the execution to a target and an +/// integer different from zero otherwise. +int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgsBase, + void **Args, int64_t *ArgSizes, int64_t *ArgTypes, void **ArgMappers, + int32_t TeamNum, int32_t ThreadLimit, int IsTeamConstruct) { + // Find the table information in the map or look it up in the translation + // tables. + TableMap *TM = findTableMap(HostPtr); + + // No map for this host pointer found! + if (!TM) { + DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", + DPxPTR(HostPtr)); + return OFFLOAD_FAIL; + } + + // Get target table. + __tgt_target_table *TargetTable; + { + std::lock_guard TrlTblLock(*TrlTblMtx); + assert(TM->Table->TargetsTable.size() > (size_t)DeviceId && + "Not expecting a device ID outside the table's bounds!"); + TargetTable = TM->Table->TargetsTable[DeviceId]; + } + assert(TargetTable && "Global data has not been mapped\n"); + + __tgt_async_info AsyncInfo; + + std::vector TgtArgs; + std::vector TgtOffsets; + std::vector FPArrays; + + // Process data before launching the kernel + int Ret = + preDataProcess(DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, + ArgMappers, TgtArgs, TgtOffsets, FPArrays, &AsyncInfo); + + if (Ret != OFFLOAD_SUCCESS) { + DP("Failed to process data before kernel launch.\n"); + return OFFLOAD_FAIL; + } + + // Pop loop trip count + uint64_t LoopTripCount = getLoopTripCount(DeviceId); + + // Launch device execution. + DeviceTy &Device = Devices[DeviceId]; + void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; + DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", + TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index); + if (IsTeamConstruct) + Ret = Device.run_team_region(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], + TgtArgs.size(), TeamNum, ThreadLimit, + LoopTripCount, &AsyncInfo); + else + Ret = Device.run_region(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], + TgtArgs.size(), &AsyncInfo); + + if (Ret != OFFLOAD_SUCCESS) { + DP("Executing target region abort target.\n"); + return OFFLOAD_FAIL; + } + + // Process data after launching the kernel + Ret = postDataProcess(DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, + ArgMappers, FPArrays, &AsyncInfo); + if (Ret != OFFLOAD_SUCCESS) { + DP("Failed to process data after kernel launch.\n"); return OFFLOAD_FAIL; } diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -17,26 +17,25 @@ #include -extern int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, void **arg_mappers, - __tgt_async_info *async_info_ptr); - -extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, +extern int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, __tgt_async_info *async_info_ptr); +extern int targetDataEnd(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + void **arg_mappers, __tgt_async_info *async_info_ptr); + extern int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers, __tgt_async_info *async_info_ptr = nullptr); -extern int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, void **arg_mappers, int32_t team_num, - int32_t thread_limit, int IsTeamConstruct); +extern int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, + void **ArgsBase, void **Args, int64_t *ArgSizes, + int64_t *ArgTypes, void **ArgMappers, int32_t TeamNum, + int32_t ThreadLimit, int IsTeamConstruct); extern int CheckDeviceAndCtors(int64_t device_id); @@ -74,8 +73,8 @@ // size_t size, int64_t type); typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t); -// Function pointer type for target_data_* functions (target_data_begin, -// target_data_end and target_data_update). +// Function pointer type for target_data_* functions (targetDataBegin, +// targetDataEnd and target_data_update). typedef int (*TargetDataFuncPtrTy)(DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, void **, __tgt_async_info *);