diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -109,6 +109,18 @@ TARGET_ALLOC_DEFAULT }; +/// This struct contains all of the arguments to a target kernel region launch. +struct __tgt_kernel_arguments { + int32_t Version; // Version of this struct for ABI compatibility. + int32_t NumArgs; // Number of arguments in each input pointer. + void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct). + void **ArgPtrs; // Pointer to the argument data. + int64_t *ArgSizes; // Size of the argument data in bytes. + int64_t *ArgTypes; // Type of the data (e.g. to / from). + void **ArgNames; // Name of the data for debugging, possibly null. + void **ArgMappers; // User-defined mappers, possibly null. +}; + /// This struct is a record of an entry point or global. For a function /// entry point the size is expected to be zero struct __tgt_offload_entry { @@ -345,11 +357,19 @@ map_var_info_t *ArgNames, void **ArgMappers, int32_t NumTeams, int32_t ThreadLimit); int __tgt_target_teams_nowait_mapper( - ident_t *Loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum, - void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, - map_var_info_t *ArgNames, void **ArgMappers, int32_t NumTeams, - int32_t ThreadLimit, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, - void *NoAliasDepList); + ident_t *loc, int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + map_var_info_t *arg_names, void **arg_mappers, int32_t num_teams, + int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList); +int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, void *HostPtr, + __tgt_kernel_arguments *Args); +int __tgt_target_kernel_nowait(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, 
int32_t ThreadLimit, void *HostPtr, + __tgt_kernel_arguments *Args, int32_t DepNum, + void *DepList, int32_t NoAliasDepNum, + void *NoAliasDepList); void __kmpc_push_target_tripcount(int64_t DeviceId, uint64_t LoopTripcount); diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -24,6 +24,8 @@ __tgt_target_data_update_nowait_mapper; __tgt_target_nowait_mapper; __tgt_target_teams_nowait_mapper; + __tgt_target_kernel; + __tgt_target_kernel_nowait; __tgt_mapper_num_components; __tgt_push_mapper_component; __kmpc_push_target_tripcount; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -278,36 +278,9 @@ int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers) { TIMESCOPE_WITH_IDENT(Loc); - DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 - "\n", - DPxPTR(HostPtr), DeviceId); - if (checkDeviceAndCtors(DeviceId, Loc)) { - DP("Not offloading to device %" PRId64 "\n", DeviceId); - return OMP_TGT_FAIL; - } - - if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) - printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames, - "Entering OpenMP kernel"); -#ifdef OMPTARGET_DEBUG - for (int I = 0; I < ArgNum; ++I) { - DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 ", Name=%s\n", - I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I], - (ArgNames) ? 
getNameFromMapping(ArgNames[I]).c_str() : "unknown"); - } -#endif - - DeviceTy &Device = *PM->Devices[DeviceId]; - AsyncInfoTy AsyncInfo(Device); - int Rc = - target(Loc, Device, HostPtr, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, - ArgNames, ArgMappers, 0, 0, false /*team*/, AsyncInfo); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); - handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); - assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_mapper unexpected failure!"); - return OMP_TGT_SUCCESS; + __tgt_kernel_arguments KernelArgs{1, ArgNum, ArgsBase, Args, + ArgSizes, ArgTypes, ArgNames, ArgMappers}; + return __tgt_target_kernel(Loc, DeviceId, -1, 0, HostPtr, &KernelArgs); } EXTERN int __tgt_target_nowait_mapper( @@ -352,50 +325,78 @@ map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit) { + TIMESCOPE_WITH_IDENT(Loc); + __tgt_kernel_arguments KernelArgs{1, ArgNum, ArgsBase, Args, + ArgSizes, ArgTypes, ArgNames, ArgMappers}; + return __tgt_target_kernel(Loc, DeviceId, TeamNum, ThreadLimit, HostPtr, + &KernelArgs); +} + +/// Implements a kernel entry that executes the target region on the specified +/// device. +/// +/// \param Loc Source location associated with this target region. +/// \param DeviceId The device to execute this region, -1 indicates the default. +/// \param NumTeams Number of teams to launch the region with, -1 indicates a +/// non-teams region and 0 indicates it was unspecified. +/// \param ThreadLimit Limit to the number of threads to use in the kernel +/// launch, 0 indicates it was unspecified. +/// \param HostPtr The pointer to the host function registered with the kernel. +/// \param Args All arguments to this kernel launch (see struct definition). 
+EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, + int32_t ThreadLimit, void *HostPtr, + __tgt_kernel_arguments *Args) { + TIMESCOPE_WITH_IDENT(Loc); DP("Entering target region with entry point " DPxMOD " and device Id %" PRId64 "\n", DPxPTR(HostPtr), DeviceId); + if (Args->Version != 1) { + DP("Unexpected ABI version: %d\n", Args->Version); + } if (checkDeviceAndCtors(DeviceId, Loc)) { DP("Not offloading to device %" PRId64 "\n", DeviceId); return OMP_TGT_FAIL; } if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) - printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames, + printKernelArguments(Loc, DeviceId, Args->NumArgs, Args->ArgSizes, + Args->ArgTypes, Args->ArgNames, "Entering OpenMP kernel"); #ifdef OMPTARGET_DEBUG - for (int I = 0; I < ArgNum; ++I) { + for (int I = 0; I < Args->NumArgs; ++I) { DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 ", Type=0x%" PRIx64 ", Name=%s\n", - I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I], - (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown"); + I, DPxPTR(Args->ArgBasePtrs[I]), DPxPTR(Args->ArgPtrs[I]), + Args->ArgSizes[I], Args->ArgTypes[I], + (Args->ArgNames) ? 
getNameFromMapping(Args->ArgNames[I]).c_str() + : "unknown"); } #endif + bool IsTeams = NumTeams != -1; + if (!IsTeams) + NumTeams = 0; + DeviceTy &Device = *PM->Devices[DeviceId]; AsyncInfoTy AsyncInfo(Device); - int Rc = target(Loc, Device, HostPtr, ArgNum, ArgsBase, Args, ArgSizes, - ArgTypes, ArgNames, ArgMappers, TeamNum, ThreadLimit, - true /*team*/, AsyncInfo); + int Rc = target(Loc, Device, HostPtr, Args->NumArgs, Args->ArgBasePtrs, + Args->ArgPtrs, Args->ArgSizes, Args->ArgTypes, Args->ArgNames, + Args->ArgMappers, NumTeams, ThreadLimit, IsTeams, AsyncInfo); if (Rc == OFFLOAD_SUCCESS) Rc = AsyncInfo.synchronize(); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); - assert(Rc == OFFLOAD_SUCCESS && - "__tgt_target_teams_mapper unexpected failure!"); + assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); return OMP_TGT_SUCCESS; } -EXTERN int __tgt_target_teams_nowait_mapper( - ident_t *Loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum, - void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, - map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, - int32_t ThreadLimit, int32_t DepNum, void *DepList, int32_t NoAliasDepNum, - void *NoAliasDepList) { +EXTERN int __tgt_target_kernel_nowait( + ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit, + void *HostPtr, __tgt_kernel_arguments *Args, int32_t DepNum, void *DepList, + int32_t NoAliasDepNum, void *NoAliasDepList) { TIMESCOPE_WITH_IDENT(Loc); - return __tgt_target_teams_mapper(Loc, DeviceId, HostPtr, ArgNum, ArgsBase, - Args, ArgSizes, ArgTypes, ArgNames, - ArgMappers, TeamNum, ThreadLimit); + return __tgt_target_kernel(Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, + Args); } // Get the current number of components for a user-defined mapper.