diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -189,8 +189,11 @@ /// Getter of the HSA memory pool. hsa_amd_memory_pool_t get() const { return MemoryPool; } - /// Indicate if it belongs to the global segment. + /// Indicate the segment to which the memory pool belongs. bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); } + bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); } + bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); } + bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); } /// Indicate if it is fine-grained memory. Valid only for global. bool isFineGrained() const { @@ -246,7 +249,6 @@ return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s"); } -private: /// Get attribute from the memory pool. template <typename Ty> Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { @@ -255,6 +257,11 @@ return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s"); } + template <typename Ty> + hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { + return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); + } + /// Get attribute from the memory pool relating to an agent. template <typename Ty> Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind, @@ -266,6 +273,7 @@ "Error in hsa_amd_agent_memory_pool_get_info: %s"); } +private: /// The HSA memory pool. hsa_amd_memory_pool_t MemoryPool; @@ -2100,8 +2108,206 @@ } /// Print information about the device. - Error printInfoImpl() override { - // TODO: Implement the basic info. 
+ Error obtainInfoImpl(InfoQueueTy &Info) override { + char TmpChar[1000]; + const char *TmpCharPtr; + uint16_t Major, Minor; + uint32_t TmpUInt, TmpUInt2; + uint32_t CacheSize[4]; + size_t TmpSt; + bool TmpBool; + uint16_t WorkgrpMaxDim[3]; + hsa_dim3_t GridMaxDim; + hsa_status_t Status, Status2; + + Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major); + Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); + if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) + Info.add("HSA Runtime Version", + std::to_string(Major) + "." + std::to_string(Minor)); + + Info.add("HSA OpenMP Device Number", DeviceId); + + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Product Name", TmpChar); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Device Name", TmpChar); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Vendor Name", TmpChar); + + hsa_device_type_t DevType; + Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); + if (Status == HSA_STATUS_SUCCESS) { + switch (DevType) { + case HSA_DEVICE_TYPE_CPU: + TmpCharPtr = "CPU"; + break; + case HSA_DEVICE_TYPE_GPU: + TmpCharPtr = "GPU"; + break; + case HSA_DEVICE_TYPE_DSP: + TmpCharPtr = "DSP"; + break; + default: + TmpCharPtr = "Unknown"; + } + Info.add("Device Type", TmpCharPtr); + } + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Max Queues", TmpUInt); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Queue Min Size", TmpUInt); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Queue Max Size", TmpUInt); + + // FIXME: This is deprecated according to HSA documentation. 
But using + // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during + // runtime. + Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize); + if (Status == HSA_STATUS_SUCCESS) { + Info.add("Cache"); + + for (int I = 0; I < 4; I++) + if (CacheSize[I]) + Info.add("L" + std::to_string(I), CacheSize[I]); + } + + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Cacheline Size", TmpUInt); + + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Max Clock Freq", TmpUInt, "MHz"); + + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Compute Units", TmpUInt); + + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("SIMD per CU", TmpUInt); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Fast F16 Operation", TmpBool); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Wavefront Size", TmpUInt2); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Workgroup Max Size", TmpUInt); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); + if (Status == HSA_STATUS_SUCCESS) { + Info.add("Workgroup Max Size per Dimension"); + Info.add("x", WorkgrpMaxDim[0]); + Info.add("y", WorkgrpMaxDim[1]); + Info.add("z", WorkgrpMaxDim[2]); + } + + Status = getDeviceAttrRaw( + (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) { + Info.add("Max Waves Per CU", TmpUInt); + Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2); + } + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); + if (Status 
== HSA_STATUS_SUCCESS) + Info.add("Grid Max Size", TmpUInt); + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); + if (Status == HSA_STATUS_SUCCESS) { + Info.add("Grid Max Size per Dimension"); + Info.add("x", GridMaxDim.x); + Info.add("y", GridMaxDim.y); + Info.add("z", GridMaxDim.z); + } + + Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Max fbarriers/Workgrp", TmpUInt); + + Info.add("Memory Pools"); + for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { + std::string TmpStr, TmpStr2; + + if (Pool->isGlobal()) + TmpStr = "Global"; + else if (Pool->isReadOnly()) + TmpStr = "ReadOnly"; + else if (Pool->isPrivate()) + TmpStr = "Private"; + else if (Pool->isGroup()) + TmpStr = "Group"; + else + TmpStr = "Unknown"; + + Info.add(std::string("Pool ") + TmpStr); + + if (Pool->isGlobal()) { + if (Pool->isFineGrained()) + TmpStr2 += "Fine Grained "; + if (Pool->isCoarseGrained()) + TmpStr2 += "Coarse Grained "; + if (Pool->supportsKernelArgs()) + TmpStr2 += "Kernarg "; + + Info.add("Flags", TmpStr2); + } + + Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Size", TmpSt, "bytes"); + + Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, + TmpBool); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Allocatable", TmpBool); + + Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, + TmpSt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Runtime Alloc Granule", TmpSt, "bytes"); + + Status = Pool->getAttrRaw( + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Runtime Alloc Alignment", TmpSt, "bytes"); + + Status = + Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Accessible by all", TmpBool); + } + + Info.add("ISAs"); + auto Err = 
utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { + Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); + if (Status == HSA_STATUS_SUCCESS) + Info.add("Name", TmpChar); + + return Status; + }); + + // Silently consume the error. + if (Err) + consumeError(std::move(Err)); + return Plugin::success(); } @@ -2126,6 +2332,11 @@ return Plugin::check(Status, "Error in hsa_agent_get_info: %s"); } + template <typename Ty> + hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) { + return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); + } + /// Get the device agent. hsa_agent_t getAgent() const override { return Agent; } diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -13,6 +13,7 @@ #include #include +#include <deque> #include #include #include @@ -33,6 +34,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" namespace llvm { @@ -84,6 +86,76 @@ __tgt_async_info *AsyncInfoPtr; }; +/// The information level represents the level of a key-value property in the +/// info tree print (i.e. indentation). The first level should be the default. +enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 }; + +/// Class for storing device information to be printed later. An object of this +/// type acts as a queue of key-value properties. Each property has a key, +/// a value, and an optional unit for the value. For printing purposes, the +/// information can be classified into several levels. These levels are useful +/// for defining sections and subsections. 
Thus, each key-value property also +/// has an additional field indicating the level to which it belongs. Notice that +/// we use the level to determine the indentation of the key-value property at +/// printing time. See the enum InfoLevelKind for the list of accepted levels. +class InfoQueueTy { + struct InfoQueueEntryTy { + std::string Key; + std::string Value; + std::string Units; + uint64_t Level; + }; + + std::deque<InfoQueueEntryTy> Queue; + +public: + /// Add a new info entry to the queue. The entry requires at least a key + /// string in \p Key. The value in \p Value is optional and can be any type + /// that is representable as a string. The units in \p Units is optional and + /// must be a string. The info level is a template parameter that defaults to + /// the first level (top level). + template <InfoLevelKind L = InfoLevel1, typename T = std::string> + void add(const std::string &Key, T Value = T(), + const std::string &Units = std::string()) { + assert(!Key.empty() && "Invalid info key"); + + // Convert the value to a string depending on its type. + if constexpr (std::is_same_v<T, bool>) + Queue.push_back({Key, Value ? "Yes" : "No", Units, L}); + else if constexpr (std::is_arithmetic_v<T>) + Queue.push_back({Key, std::to_string(Value), Units, L}); + else + Queue.push_back({Key, Value, Units, L}); + } + + /// Print all info entries added to the queue. + void print() const { + // We print four spaces for each level. + constexpr uint64_t IndentSize = 4; + + // Find the maximum key length (level + key) to compute the individual + // indentation of each entry. + uint64_t MaxKeySize = 0; + for (const auto &Entry : Queue) { + uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize; + if (KeySize > MaxKeySize) + MaxKeySize = KeySize; + } + + // Print all info entries. + for (const auto &Entry : Queue) { + // Compute the indentations for the current entry. 
+ uint64_t KeyIndentSize = Entry.Level * IndentSize; + uint64_t ValIndentSize = + MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize; + + llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key + << std::string(ValIndentSize, ' ') << Entry.Value + << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n"; + } + } +}; + /// Class wrapping a __tgt_device_image and its offload entry table on a /// specific device. This class is responsible for storing and managing /// the offload entries for an image on a device. @@ -645,7 +717,7 @@ /// Print information about the device. Error printInfo(); - virtual Error printInfoImpl() = 0; + virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0; /// Getters of the grid values. uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; } diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -995,8 +995,16 @@ } Error GenericDeviceTy::printInfo() { - // TODO: Print generic information here - return printInfoImpl(); + InfoQueueTy InfoQueue; + + // Get the vendor-specific info entries describing the device properties. + if (auto Err = obtainInfoImpl(InfoQueue)) + return Err; + + // Print all info entries. + InfoQueue.print(); + + return Plugin::success(); } Error GenericDeviceTy::createEvent(void **EventPtrStorage) { diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -620,147 +620,170 @@ } /// Print information about the device. 
- Error printInfoImpl() override { + Error obtainInfoImpl(InfoQueueTy &Info) override { char TmpChar[1000]; - std::string TmpStr; + const char *TmpCharPtr; size_t TmpSt; - int TmpInt, TmpInt2, TmpInt3; - - // TODO: All these calls should be checked, but the whole printInfo must be - // improved, so we will refactor it in the future. - cuDriverGetVersion(&TmpInt); - printf(" CUDA Driver Version: \t\t%d \n", TmpInt); - printf(" CUDA Device Number: \t\t%d \n", DeviceId); - - cuDeviceGetName(TmpChar, 1000, Device); - printf(" Device Name: \t\t\t%s \n", TmpChar); - - cuDeviceTotalMem(&TmpSt, Device); - printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - Device); - printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device); - printf(" Concurrent Copy and Execution: \t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - Device); - printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); - - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device); - printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - Device), - printf(" Registers per Block: \t\t%d \n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); - printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - Device); - printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); - cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device); - cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device); - printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, - 
TmpInt3); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device); - cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device); - cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device); - printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, - TmpInt3); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device); - printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - Device); - printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device); - printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, - Device); - printf(" Execution Timeout: \t\t\t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device); - printf(" Integrated Device: \t\t\t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, - Device); - printf(" Can Map Host Memory: \t\t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device); - if (TmpInt == CU_COMPUTEMODE_DEFAULT) - TmpStr = "DEFAULT"; - else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) - TmpStr = "PROHIBITED"; - else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) - TmpStr = "EXCLUSIVE PROCESS"; - else - TmpStr = "unknown"; - printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, - Device); - printf(" Concurrent Kernels: \t\t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device); - printf(" ECC Enabled: \t\t\t%s \n", TmpInt ? 
"Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - Device); - printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - Device); - printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device); - printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); - - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device); - printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, - Device); - printf(" Async Engines: \t\t\t%s (%d) \n", TmpInt ? "Yes" : "No", - TmpInt); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, - Device); - printf(" Unified Addressing: \t\t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device); - printf(" Managed Memory: \t\t\t%s \n", TmpInt ? 
"Yes" : "No"); + int TmpInt; + + CUresult Res = cuDriverGetVersion(&TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("CUDA Driver Version", TmpInt); + + Info.add("CUDA OpenMP Device Number", DeviceId); + + Res = cuDeviceGetName(TmpChar, 1000, Device); + if (Res == CUDA_SUCCESS) + Info.add("Device Name", TmpChar); + + Res = cuDeviceTotalMem(&TmpSt, Device); + if (Res == CUDA_SUCCESS) + Info.add("Global Memory Size", TmpSt, "bytes"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Number of Multiprocessors", TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Concurrent Copy and Execution", (bool)TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Total Constant Memory", TmpInt, "bytes"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Max Shared Memory per Block", TmpInt, "bytes"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Registers per Block", TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Warp Size", TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Maximum Threads per Block", TmpInt); + + Info.add("Maximum Block Dimensions", ""); + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("x", TmpInt); + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("y", TmpInt); + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("z", TmpInt); + + Info.add("Maximum Grid Dimensions", ""); + Res = 
getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("x", TmpInt); + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("y", TmpInt); + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("z", TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Maximum Memory Pitch", TmpInt, "bytes"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Texture Alignment", TmpInt, "bytes"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Clock Rate", TmpInt, "kHz"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Execution Timeout", (bool)TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Integrated Device", (bool)TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Can Map Host Memory", (bool)TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt); + if (Res == CUDA_SUCCESS) { + if (TmpInt == CU_COMPUTEMODE_DEFAULT) + TmpCharPtr = "Default"; + else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) + TmpCharPtr = "Prohibited"; + else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) + TmpCharPtr = "Exclusive process"; + else + TmpCharPtr = "Unknown"; + Info.add("Compute Mode", TmpCharPtr); + } + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Concurrent Kernels", (bool)TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("ECC Enabled", (bool)TmpInt); + + Res = 
getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Memory Clock Rate", TmpInt, "kHz"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Memory Bus Width", TmpInt, "bits"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("L2 Cache Size", TmpInt, "bytes"); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, + TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Max Threads Per SMP", TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Async Engines", TmpInt); - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, - Device); - printf(" Concurrent Managed Memory: \t\t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device); - printf(" Preemption Supported: \t\t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, - Device); - printf(" Cooperative Launch: \t\t%s \n", TmpInt ? "Yes" : "No"); - - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device); - printf(" Multi-Device Boars: \t\t%s \n", TmpInt ? 
"Yes" : "No"); + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Unified Addressing", (bool)TmpInt); - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - Device); - cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - Device); - printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Managed Memory", (bool)TmpInt); + + Res = + getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Concurrent Managed Memory", (bool)TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, + TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Preemption Supported", (bool)TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Cooperative Launch", (bool)TmpInt); + + Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt); + if (Res == CUDA_SUCCESS) + Info.add("Multi-Device Boards", (bool)TmpInt); + + Info.add("Compute Capabilities", ComputeCapability.str()); return Plugin::success(); } @@ -797,6 +820,10 @@ return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"); } + CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) { + return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device); + } + /// See GenericDeviceTy::getComputeUnitKind(). 
std::string getComputeUnitKind() const override { return ComputeCapability.str(); diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -294,8 +294,8 @@ Error syncEventImpl(void *EventPtr) override { return Plugin::success(); } /// Print information about the device. - Error printInfoImpl() override { - printf(" This is a generic-elf-64bit device\n"); + Error obtainInfoImpl(InfoQueueTy &Info) override { + Info.add("Device Type", "Generic-elf-64bit"); return Plugin::success(); }