diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -333,6 +333,9 @@
 
 void __tgt_set_info_flag(uint32_t);
 
+/// Print information about device \p device_id through its RTL plugin.
+/// Returns false if the plugin does not provide the query, true otherwise.
+bool __tgt_print_device_info(int64_t device_id);
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -142,6 +142,9 @@
 // Set plugin's internal information flag externally.
 void __tgt_rtl_set_info_flag(uint32_t);
 
+// Print information about the device with the given ID.
+void __tgt_rtl_print_device_info(int32_t ID);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -61,6 +61,11 @@
 CUresult cuDeviceGetCount(int *);
 CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);
 
+// Device info (the device handle is passed by value, as in the driver API)
+CUresult cuDeviceGetName(char *, int, CUdevice);
+CUresult cuDeviceTotalMem(size_t *, CUdevice);
+CUresult cuDriverGetVersion(int *);
+
 CUresult cuGetErrorString(CUresult, const char **);
 CUresult cuInit(unsigned);
 CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
@@ -28,6 +28,11 @@
 DLWRAP(cuDeviceGetCount, 1);
 DLWRAP(cuFuncGetAttribute, 3);
 
+// Device info
+DLWRAP(cuDeviceGetName, 3);
+DLWRAP(cuDeviceTotalMem, 2);
+DLWRAP(cuDriverGetVersion, 1);
+
 DLWRAP(cuGetErrorString, 2);
 DLWRAP(cuLaunchKernel, 11);
 
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1333,6 +1334,190 @@
   InfoLevel.store(NewInfoLevel);
 }
 
+// Map a truthy/falsy attribute value to the "Yes"/"No" text printed below.
+#define BOOL2TEXT(b) ((b) ? "Yes" : "No")
+
+/// Print a human-readable summary of CUDA device \p device_id to stdout.
+///
+/// The report lists the driver version, device name, memory sizes, launch
+/// limits, clock rates and yes/no capability flags, one property per line.
+/// Each driver query goes through checkResult with a message naming the call.
+void __tgt_rtl_print_device_info(int32_t device_id) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  char TmpChar[1000];
+  size_t TmpSt;
+  int TmpInt, TmpInt2, TmpInt3;
+
+  CUdevice Device;
+  checkResult(cuDeviceGet(&Device, device_id),
+              "Error returned from cuDeviceGet\n");
+
+  checkResult(cuDriverGetVersion(&TmpInt),
+              "Error returned from cuDriverGetVersion\n");
+  printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
+  printf(" CUDA Device Number: \t\t%d \n", device_id);
+  checkResult(cuDeviceGetName(TmpChar, 1000, Device),
+              "Error returned from cuDeviceGetName\n");
+  printf(" Device Name: \t\t\t%s \n", TmpChar);
+  checkResult(cuDeviceTotalMem(&TmpSt, Device),
+              "Error returned from cuDeviceTotalMem\n");
+  printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
+  checkResult(
+      cuDeviceGetAttribute(
+          &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Registers per Block: \t\t%d \n", TmpInt);
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
+  checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+                                   Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  checkResult(cuDeviceGetAttribute(&TmpInt2,
+                                   CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  checkResult(cuDeviceGetAttribute(&TmpInt3,
+                                   CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
+         TmpInt3);
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  checkResult(cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,
+                                   Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  checkResult(cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,
+                                   Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
+         TmpInt3);
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  // Translate the compute-mode enumerator into a printable name.
+  if (TmpInt == CU_COMPUTEMODE_DEFAULT)
+    strcpy(TmpChar, "DEFAULT");
+  else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
+    strcpy(TmpChar, "PROHIBITED");
+  else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
+    strcpy(TmpChar, "EXCLUSIVE PROCESS");
+  else
+    strcpy(TmpChar, "unknown");
+  printf(" Compute Mode: \t\t\t%s \n", TmpChar);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
+  checkResult(
+      cuDeviceGetAttribute(
+          &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(
+      cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(
+      cuDeviceGetAttribute(
+          &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(
+      cuDeviceGetAttribute(
+          &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
+      "Error returned from cuDeviceGetAttribute\n");
+  printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(cuDeviceGetAttribute(
+                  &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
+  checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD,
+                                   Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
+  // Major and minor numbers are printed back to back, e.g. 7 and 5 give "75".
+  checkResult(cuDeviceGetAttribute(&TmpInt,
+                                   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                                   Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  checkResult(cuDeviceGetAttribute(&TmpInt2,
+                                   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                                   Device),
+              "Error returned from cuDeviceGetAttribute\n");
+  printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
--- a/openmp/libomptarget/plugins/exports
+++ b/openmp/libomptarget/plugins/exports
@@ -23,6 +23,7 @@
     __tgt_rtl_unregister_lib;
     __tgt_rtl_supports_empty_images;
     __tgt_rtl_set_info_flag;
+    __tgt_rtl_print_device_info;
   local:
     *;
 };
diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -252,6 +252,10 @@
   /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
   int32_t synchronize(AsyncInfoTy &AsyncInfo);
 
+  /// Ask the RTL plugin to print information about device \p RTLDevID.
+  /// Returns false if the plugin has no print_device_info entry point.
+  bool printDeviceInfo(int32_t RTLDevID);
+
 private:
   // Call to RTL
   void init(); // To be called only via DeviceTy::initOnce()
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -487,6 +487,15 @@
                                 TgtOffsets, TgtVarsSize, AsyncInfo);
 }
 
+// Forward a device-info request to the RTL plugin. Returns false when the
+// plugin did not provide print_device_info, true after the plugin was called.
+bool DeviceTy::printDeviceInfo(int32_t RTLDevId) {
+  if (!RTL->print_device_info)
+    return false;
+  RTL->print_device_info(RTLDevId);
+  return true;
+}
+
 // Run team region on device.
 int32_t DeviceTy::runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
                                 ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -40,6 +40,7 @@
     llvm_omp_target_alloc_shared;
     llvm_omp_target_alloc_device;
     __tgt_set_info_flag;
+    __tgt_print_device_info;
   local:
     *;
 };
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -466,3 +466,12 @@
     R.set_info_flag(NewInfoLevel);
   }
 }
+
+/// Print information about device \p device_id through its RTL plugin.
+/// Returns false if the plugin does not provide the query, true otherwise.
+// NOTE(review): device_id indexes PM->Devices with no range check visible
+// here -- confirm that callers only pass valid device numbers.
+EXTERN bool __tgt_print_device_info(int64_t device_id) {
+  return PM->Devices[device_id].printDeviceInfo(
+      PM->Devices[device_id].RTLDeviceID);
+}
diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h
--- a/openmp/libomptarget/src/rtl.h
+++ b/openmp/libomptarget/src/rtl.h
@@ -55,6 +55,7 @@
   typedef int64_t(synchronize_ty)(int32_t, __tgt_async_info *);
   typedef int32_t (*register_lib_ty)(__tgt_bin_desc *);
   typedef int32_t(supports_empty_images_ty)();
+  typedef void(print_device_info_ty)(int32_t);
   typedef void(set_info_flag_ty)(uint32_t);
 
   int32_t Idx = -1; // RTL index, index is the number of devices
@@ -93,6 +94,7 @@
   register_lib_ty unregister_lib = nullptr;
   supports_empty_images_ty *supports_empty_images = nullptr;
   set_info_flag_ty *set_info_flag = nullptr;
+  print_device_info_ty *print_device_info = nullptr;
 
   // Are there images associated with this RTL.
   bool isUsed = false;
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -177,6 +177,9 @@
         dlsym(dynlib_handle, "__tgt_rtl_supports_empty_images");
     *((void **)&R.set_info_flag) =
         dlsym(dynlib_handle, "__tgt_rtl_set_info_flag");
+    // Optional entry point: stays null if the plugin does not export it.
+    *((void **)&R.print_device_info) =
+        dlsym(dynlib_handle, "__tgt_rtl_print_device_info");
   }
 
   DP("RTLs loaded!\n");