diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -109,6 +109,17 @@ TARGET_ALLOC_DEFAULT }; +/// This struct contains all of the arguments to a target kernel region launch. +struct __tgt_kernel_arguments { + int32_t NumArgs; // Number of arguments in each input pointer. + void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct). + void **ArgPtrs; // Pointer to the argument data. + int64_t *ArgSizes; // Size of the argument data in bytes. + int64_t *ArgTypes; // Type of the data (e.g. to / from). + void **ArgNames; // Name of the data for debugging, possibly null. + void **ArgMappers; // User-defined mappers, possibly null. +}; + /// This struct is a record of an entry point or global. For a function /// entry point the size is expected to be zero struct __tgt_offload_entry { @@ -353,6 +364,14 @@ map_var_info_t *arg_names, void **arg_mappers, int32_t num_teams, int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, void *noAliasDepList); +int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, void *HostPtr, + __tgt_kernel_arguments *Args, int32_t Version); +int __tgt_target_kernel_nowait(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, void *HostPtr, + __tgt_kernel_arguments *Args, int32_t Version, + int32_t DepNum, void *DepList, + int32_t NoAliasDepNum, void *NoAliasDepList); void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount); diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -24,6 +24,8 @@ __tgt_target_data_update_nowait_mapper; __tgt_target_nowait_mapper; __tgt_target_teams_nowait_mapper; + __tgt_target_kernel; + __tgt_target_kernel_nowait; __tgt_mapper_num_components;
__tgt_push_mapper_component; __kmpc_push_target_tripcount; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -279,36 +279,9 @@ int64_t *arg_sizes, int64_t *arg_types, map_var_info_t *arg_names, void **arg_mappers) { TIMESCOPE_WITH_IDENT(loc); - DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 - "\n", - DPxPTR(host_ptr), device_id); - if (checkDeviceAndCtors(device_id, loc)) { - DP("Not offloading to device %" PRId64 "\n", device_id); - return OMP_TGT_FAIL; - } - - if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) - printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types, - arg_names, "Entering OpenMP kernel"); -#ifdef OMPTARGET_DEBUG - for (int i = 0; i < arg_num; ++i) { - DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 ", Name=%s\n", - i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i], - (arg_names) ? 
getNameFromMapping(arg_names[i]).c_str() : "unknown"); - } -#endif - - DeviceTy &Device = *PM->Devices[device_id]; - AsyncInfoTy AsyncInfo(Device); - int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, - AsyncInfo); - if (rc == OFFLOAD_SUCCESS) - rc = AsyncInfo.synchronize(); - handleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); - assert(rc == OFFLOAD_SUCCESS && "__tgt_target_mapper unexpected failure!"); - return OMP_TGT_SUCCESS; + __tgt_kernel_arguments Args{arg_num, args_base, args, arg_sizes, + arg_types, arg_names, arg_mappers}; + return __tgt_target_kernel(loc, device_id, -1, -1, host_ptr, &Args, 1); } EXTERN int __tgt_target_nowait_mapper( @@ -353,50 +326,96 @@ map_var_info_t *arg_names, void **arg_mappers, int32_t team_num, int32_t thread_limit) { + TIMESCOPE_WITH_IDENT(loc); + + __tgt_kernel_arguments Args{arg_num, args_base, args, arg_sizes, + arg_types, arg_names, arg_mappers}; + return __tgt_target_kernel(loc, device_id, team_num, thread_limit, host_ptr, + &Args, 1); +} + +EXTERN int __tgt_target_teams_nowait_mapper( + ident_t *loc, int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, int32_t team_num, + int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + TIMESCOPE_WITH_IDENT(loc); + + return __tgt_target_teams_mapper(loc, device_id, host_ptr, arg_num, args_base, + args, arg_sizes, arg_types, arg_names, + arg_mappers, team_num, thread_limit); +} + +/// Implements a kernel entry that executes the target region on the specified +/// device. +/// +/// \param Loc Source location associated with this target region. +/// \param DeviceId The device to execute this region, -1 indicates the default.
+/// \param NumTeams Number of teams to launch the region with, -1 indicates a +/// non-teams region and 0 indicates it was unspecified. +/// \param ThreadLimit Limit to the number of threads to use in the kernel +/// launch, 0 indicates it was unspecified. +/// \param HostPtr The pointer to the host function registered with the kernel. +/// \param Args All arguments to this kernel launch (see struct definition). +/// \param Version Version identifier used for backwards-compatibility. +EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, void *HostPtr, + __tgt_kernel_arguments *Args, int32_t Version) { + TIMESCOPE_WITH_IDENT(Loc); DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 "\n", - DPxPTR(host_ptr), device_id); - if (checkDeviceAndCtors(device_id, loc)) { - DP("Not offloading to device %" PRId64 "\n", device_id); + DPxPTR(HostPtr), DeviceId); + if (Version != 1) { + DP("Unexpected ABI version: %d\n", Version); + } + if (checkDeviceAndCtors(DeviceId, Loc)) { + DP("Not offloading to device %" PRId64 "\n", DeviceId); return OMP_TGT_FAIL; } if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) - printKernelArguments(loc, device_id, arg_num, arg_sizes, arg_types, - arg_names, "Entering OpenMP kernel"); + printKernelArguments(Loc, DeviceId, Args->NumArgs, Args->ArgSizes, + Args->ArgTypes, Args->ArgNames, + "Entering OpenMP kernel"); #ifdef OMPTARGET_DEBUG - for (int i = 0; i < arg_num; ++i) { + for (int i = 0; i < Args->NumArgs; ++i) { DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 ", Type=0x%" PRIx64 ", Name=%s\n", - i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i], - (arg_names) ? getNameFromMapping(arg_names[i]).c_str() : "unknown"); + i, DPxPTR(Args->ArgBasePtrs[i]), DPxPTR(Args->ArgPtrs[i]), + Args->ArgSizes[i], Args->ArgTypes[i], + (Args->ArgNames) ?
getNameFromMapping(Args->ArgNames[i]).c_str() + : "unknown"); } #endif - DeviceTy &Device = *PM->Devices[device_id]; + bool IsTeams = NumTeams != -1; + if (!IsTeams) + NumTeams = 0; + + DeviceTy &Device = *PM->Devices[DeviceId]; AsyncInfoTy AsyncInfo(Device); - int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes, - arg_types, arg_names, arg_mappers, team_num, thread_limit, - true /*team*/, AsyncInfo); + int rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs, + Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames, + Args->ArgMappers, NumTeams, ThreadLimit, IsTeams, AsyncInfo); if (rc == OFFLOAD_SUCCESS) rc = AsyncInfo.synchronize(); - handleTargetOutcome(rc == OFFLOAD_SUCCESS, loc); - assert(rc == OFFLOAD_SUCCESS && - "__tgt_target_teams_mapper unexpected failure!"); + handleTargetOutcome(rc == OFFLOAD_SUCCESS, Loc); + assert(rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); return OMP_TGT_SUCCESS; } -EXTERN int __tgt_target_teams_nowait_mapper( - ident_t *loc, int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - map_var_info_t *arg_names, void **arg_mappers, int32_t team_num, - int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, - void *noAliasDepList) { - TIMESCOPE_WITH_IDENT(loc); - - return __tgt_target_teams_mapper(loc, device_id, host_ptr, arg_num, args_base, - args, arg_sizes, arg_types, arg_names, - arg_mappers, team_num, thread_limit); +EXTERN int __tgt_target_kernel_nowait(ident_t *Loc, int64_t DeviceId, + int32_t NumTeams, int32_t ThreadLimit, + void *HostPtr, + __tgt_kernel_arguments *Args, + int32_t Version, int32_t DepNum, + void *DepList, int32_t NoAliasDepNum, + void *NoAliasDepList) { + TIMESCOPE_WITH_IDENT(Loc); + + return __tgt_target_kernel(Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, + Args, 1); } // Get the current number of components for a user-defined mapper.