diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -320,6 +320,9 @@ bool IsInit; std::once_flag InitFlag; bool HasPendingGlobals; + /// The physical number of team processors. For cuda, this is number of SMs, + /// for AMD, this is number of CUs. Field used by ompx_get_team_procs(devid). + int32_t TeamProcs; /// Host data to device map type with a wrapper key indirection that allows /// concurrent modification of the entries without invalidating the underlying @@ -462,6 +465,10 @@ /// Destroy the event. int32_t destroyEvent(void *Event); + + void setTeamProcs(int32_t num_team_procs) { TeamProcs = num_team_procs; } + int32_t getTeamProcs() { return TeamProcs; } + /// } private: diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -222,6 +222,7 @@ extern "C" { #endif +int ompx_get_team_procs(int device_num); int omp_get_num_devices(void); int omp_get_device_num(void); int omp_get_initial_device(void); diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -192,6 +192,10 @@ int32_t __tgt_rtl_init_device_info(int32_t ID, __tgt_device_info *DeviceInfoPtr, const char **ErrStr); +// Number of available physical processors to execute teams. AMD calls these +// CUs. Nvidia calls them SMs. For CPUs modeling teams, they could be sockets. +int32_t __tgt_rtl_number_of_team_procs(int32_t device_num); + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -75,6 +75,7 @@ typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **); typedef int64_t(init_device_into_ty)(int64_t, __tgt_device_info *, const char **); + typedef int32_t(number_of_team_procs_ty)(int32_t); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -124,6 +125,7 @@ destroy_event_ty *destroy_event = nullptr; init_async_info_ty *init_async_info = nullptr; init_device_into_ty *init_device_info = nullptr; + number_of_team_procs_ty *number_of_team_procs = nullptr; release_async_info_ty *release_async_info = nullptr; // Are there images associated with this RTL. diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1945,6 +1945,10 @@ return elfMachineIdIsAmdgcn(Image); } +int __tgt_rtl_number_of_team_procs(int DeviceId) { + return DeviceInfo().ComputeUnits[DeviceId]; +} + int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, __tgt_image_info *info) { if (!__tgt_rtl_is_valid_binary(image)) diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -353,6 +353,7 @@ std::vector DeviceData; std::vector> Modules; + std::vector NumberOfTeamProcs; /// Vector of flags indicating the initalization status of all associated /// devices. @@ -522,6 +523,7 @@ DeviceData.resize(NumberOfDevices); Modules.resize(NumberOfDevices); + NumberOfTeamProcs.resize(NumberOfDevices); StreamPool.resize(NumberOfDevices); EventPool.resize(NumberOfDevices); PeerAccessMatrix.resize(NumberOfDevices); @@ -585,6 +587,8 @@ int getNumOfDevices() const { return NumberOfDevices; } + int getNumOfTeamProcs(int devid) const { return NumberOfTeamProcs[devid]; } + void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } int initDevice(const int DeviceId) { @@ -653,6 +657,18 @@ DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; } + // Query attributes to for number of SMs for ompx_get_team_procs(devid) + int TmpTeamProcs; + Err = cuDeviceGetAttribute( + &TmpTeamProcs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device); + if (Err != CUDA_SUCCESS) { + DP("Error: on CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, using %d\n", 16); + NumberOfTeamProcs[DeviceId] = 16; + } else { + DP("Device %d has %d procs for team execution\n", DeviceId, TmpTeamProcs); + NumberOfTeamProcs[DeviceId] = TmpTeamProcs; + } + // We are only exploiting threads along the x axis. int MaxBlockDimX; Err = cuDeviceGetAttribute(&MaxBlockDimX, @@ -1574,6 +1590,10 @@ int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } +int32_t __tgt_rtl_number_of_team_procs(int devid) { + return DeviceRTL.getNumOfTeamProcs(devid); +} + int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { DP("Init requires flags to %" PRId64 "\n", RequiresFlags); DeviceRTL.setRequiresFlag(RequiresFlags); diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp --- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -41,6 +41,7 @@ #include "elf_common.h" #define NUMBER_OF_DEVICES 4 +#define NUMBER_OF_TEAM_PROCS 1 #define OFFLOAD_SECTION_NAME "omp_offloading_entries" /// Array of Dynamic libraries loaded for this target. @@ -128,6 +129,12 @@ int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } +// __tgt_rtl_number_of_team_procs supports ompx_get_team_procs(devid). +// Imples that support multiple host teams may want to change this. +int32_t __tgt_rtl_number_of_team_procs(int32_t device_id) { + return NUMBER_OF_TEAM_PROCS; +} + int32_t __tgt_rtl_init_device(int32_t DeviceId) { return OFFLOAD_SUCCESS; } __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, diff --git a/openmp/libomptarget/plugins/ve/src/rtl.cpp b/openmp/libomptarget/plugins/ve/src/rtl.cpp --- a/openmp/libomptarget/plugins/ve/src/rtl.cpp +++ b/openmp/libomptarget/plugins/ve/src/rtl.cpp @@ -179,6 +179,10 @@ // target RTL. int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); } +int32_t __tgt_rtl_number_of_team_procs(int device_id) { + return DeviceInfo.ProcHandles[device_id].size(); +} + // Return an integer different from zero if the provided device image can be // supported by the runtime. The functionality is similar to comparing the // result of __tgt__rtl__load__binary to NULL. However, this is meant to be a diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -30,6 +30,20 @@ return DevicesSize; } +EXTERN int ompx_get_team_procs(int device_num) { + if (!deviceIsReady(device_num)) { + DP("Device %d did not initialize\n", device_num); + // return 1 team proc for initial/host device + return 1; + } + TIMESCOPE(); + PM->RTLsMtx.lock(); + int TeamProcs = PM->Devices[device_num]->getTeamProcs(); + PM->RTLsMtx.unlock(); + DP("Call to ompx_get_team_procs returning %d\n", TeamProcs); + return TeamProcs; +} + EXTERN int omp_get_device_num(void) { TIMESCOPE(); int HostDevice = omp_get_initial_device(); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -496,6 +496,7 @@ int32_t Ret = RTL->init_device(RTLDeviceID); if (Ret != OFFLOAD_SUCCESS) return; + setTeamProcs(RTL->number_of_team_procs(RTLDeviceID)); IsInit = true; } diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -30,6 +30,7 @@ __tgt_push_mapper_component; __kmpc_push_target_tripcount; __kmpc_push_target_tripcount_mapper; + ompx_get_team_procs; omp_get_num_devices; omp_get_device_num; omp_get_initial_device; diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -127,6 +127,9 @@ if (!(*((void **)&R.number_of_devices) = DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices"))) ValidPlugin = false; + if (!(*((void **)&R.number_of_team_procs) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_team_procs"))) + ValidPlugin = false; if (!(*((void **)&R.init_device) = DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device"))) ValidPlugin = false; diff --git a/openmp/libomptarget/test/api/ompx_get_team_procs.c b/openmp/libomptarget/test/api/ompx_get_team_procs.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/api/ompx_get_team_procs.c @@ -0,0 +1,34 @@ +// RUN: %libomptarget-compile-run-and-check-generic + +#include +#include + +int test_omp_get_team_procs(int devid) { + /* checks that ompx_get_team_procs() > 0 */ + int team_procs = ompx_get_team_procs(devid); + printf("team_procs(%d) = %d\n", devid, team_procs); + +#pragma omp target + {} + + return (team_procs > 0); +} + +int main() { + int i; + int failed = 0; + + if (!test_omp_get_team_procs(omp_get_initial_device())) { + failed++; + } + if (!test_omp_get_team_procs(omp_get_default_device())) { + failed++; + } + if (failed) + printf("FAIL\n"); + else + printf("PASS\n"); + return failed; +} + +// CHECK: PASS diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var --- a/openmp/runtime/src/include/omp.h.var +++ b/openmp/runtime/src/include/omp.h.var @@ -498,6 +498,7 @@ /* LLVM Extensions */ extern void *llvm_omp_target_dynamic_shared_alloc(); + extern int __KAI_KMPC_CONVENTION ompx_get_team_procs(int); # undef __KAI_KMPC_CONVENTION # undef __KMP_IMP