diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt --- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt +++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt @@ -67,7 +67,6 @@ impl/data.cpp impl/get_elf_mach_gfx_name.cpp impl/system.cpp - impl/utils.cpp impl/msgpack.cpp src/rtl.cpp ${LIBOMPTARGET_EXTRA_SOURCE} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp @@ -17,23 +17,21 @@ using core::TaskImpl; namespace core { - -hsa_status_t Runtime::HostMalloc(void **ptr, size_t size, - hsa_amd_memory_pool_t MemoryPool) { +namespace Runtime { +hsa_status_t HostMalloc(void **ptr, size_t size, + hsa_amd_memory_pool_t MemoryPool) { hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, ptr); - DEBUG_PRINT("Malloced %p\n", *ptr); - + DP("Malloced %p\n", *ptr); if (err == HSA_STATUS_SUCCESS) { err = core::allow_access_to_all_gpu_agents(*ptr); } - return (err == HSA_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + return err; } -hsa_status_t Runtime::Memfree(void *ptr) { +hsa_status_t Memfree(void *ptr) { hsa_status_t err = hsa_amd_memory_pool_free(ptr); - DEBUG_PRINT("Freed %p\n", ptr); - - return (err == HSA_STATUS_SUCCESS) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + DP("Freed %p\n", ptr); + return err; } - +} // namespace Runtime } // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "impl_runtime.h" #include "hsa_api.h" +#include "impl_runtime.h" #include "internal.h" #include "rt.h" #include @@ -32,7 +32,7 @@ hsa_signal_value_t got = init; while (got == init) { got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init, - UINT64_MAX, ATMI_WAIT_STATE); + UINT64_MAX, HSA_WAIT_STATE_BLOCKED); } if (got != success) { @@ -64,8 +64,7 @@ void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", - size); + DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); return ret; } std::unique_ptr del(tempHostPtr); @@ -94,8 +93,7 @@ void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", - size); + DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); return ret; } std::unique_ptr del(tempHostPtr); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -24,7 +24,12 @@ #include "hsa_api.h" #include "impl_runtime.h" -#include "rt.h" + +#ifndef TARGET_NAME +#error "Missing TARGET_NAME macro" +#endif +#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" +#include "Debug.h" #define MAX_NUM_KERNELS (1024 * 16) @@ -41,34 +46,6 @@ unsigned long kernarg_template_ptr; } impl_implicit_args_t; -extern "C" { - -#ifdef DEBUG -#define DEBUG_PRINT(fmt, ...) \ - if (core::Runtime::getInstance().getDebugMode()) { \ - fprintf(stderr, "[%s:%d] " fmt, __FILE__, __LINE__, ##__VA_ARGS__); \ - } -#else -#define DEBUG_PRINT(...) \ - do { \ - } while (false) -#endif - -#ifndef HSA_RUNTIME_INC_HSA_H_ -typedef struct hsa_signal_s { - uint64_t handle; -} hsa_signal_t; -#endif - -} - -/* --------------------------------------------------------------------------------- - * Simulated CPU Data Structures and API - * --------------------------------------------------------------------------------- - */ - -#define ATMI_WAIT_STATE HSA_WAIT_STATE_BLOCKED - // ---------------------- Kernel Start ------------- typedef struct atl_kernel_info_s { uint64_t kernel_object; @@ -110,7 +87,7 @@ state.pop(); hsa_status_t rc = hsa_signal_destroy(signal); if (rc != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("Signal pool destruction failed\n"); + DP("Signal pool destruction failed\n"); } } } @@ -183,6 +160,10 @@ hsa_status_t allow_access_to_all_gpu_agents(void *ptr); } // namespace core -const char *get_error_string(hsa_status_t err); +inline const char *get_error_string(hsa_status_t err) { + const char *res; + hsa_status_t rc = hsa_status_string(err, &res); + return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN."; +} #endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h --- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h @@ -8,74 +8,26 @@ #ifndef SRC_RUNTIME_INCLUDE_RT_H_ #define SRC_RUNTIME_INCLUDE_RT_H_ -#include "impl_runtime.h" #include "hsa_api.h" +#include "impl_runtime.h" +#include "internal.h" + #include namespace core { - -#define DEFAULT_MAX_QUEUE_SIZE 4096 -#define DEFAULT_DEBUG_MODE 0 -class Environment { -public: - Environment() - : max_queue_size_(DEFAULT_MAX_QUEUE_SIZE), - debug_mode_(DEFAULT_DEBUG_MODE) { - GetEnvAll(); - } - - void GetEnvAll(); - - int getMaxQueueSize() const { return max_queue_size_; } - int getDebugMode() const { return debug_mode_; } - -private: - std::string GetEnv(const char *name) { - char *env = getenv(name); - std::string ret; - if (env) { - ret = env; - } - return ret; - } - - int max_queue_size_; - int debug_mode_; -}; - -class Runtime final { -public: - static Runtime &getInstance() { - static Runtime instance; - return instance; - } - - // modules - static hsa_status_t RegisterModuleFromMemory( - void *, size_t, hsa_agent_t agent, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector &HSAExecutables); - - // data - static hsa_status_t Memcpy(hsa_signal_t, void *, const void *, size_t); - static hsa_status_t Memfree(void *); - static hsa_status_t HostMalloc(void **ptr, size_t size, - hsa_amd_memory_pool_t MemoryPool); - - int getMaxQueueSize() const { return env_.getMaxQueueSize(); } - int getDebugMode() const { return env_.getDebugMode(); } - -protected: - Runtime() = default; - ~Runtime() = default; - Runtime(const Runtime &) = delete; - Runtime &operator=(const Runtime &) = delete; - -protected: - // variable to track environment variables - Environment env_; -}; +namespace Runtime { +hsa_status_t Memfree(void *); +hsa_status_t HostMalloc(void **ptr, size_t size, + hsa_amd_memory_pool_t MemoryPool); + +} // namespace Runtime +hsa_status_t RegisterModuleFromMemory( + std::map &KernelInfoTable, + std::map &SymbolInfoTable, + void *module_bytes, size_t module_size, hsa_agent_t agent, + hsa_status_t (*on_deserialized_data)(void *data, size_t size, + void *cb_state), + void *cb_state, std::vector &HSAExecutables); } // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -525,8 +525,8 @@ size_t padding = new_offset - offset; offset = new_offset; info.arg_offsets.push_back(lcArg.offset_); - DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), - lcArg.size_, lcArg.offset_); + DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_, + lcArg.offset_); offset += lcArg.size_; // check if the arg is a hidden/implicit arg @@ -541,13 +541,13 @@ } // add size of implicit args, e.g.: offset x, y and z and pipe pointer, but - // in ATMI, do not count the compiler set implicit args, but set your own - // implicit args by discounting the compiler set implicit args + // do not count the compiler set implicit args, but set your own implicit + // args by discounting the compiler set implicit args info.kernel_segment_size = (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) + sizeof(impl_implicit_args_t); - DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), - kernel_segment_size, info.kernel_segment_size); + DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), + kernel_segment_size, info.kernel_segment_size); // kernel received, now add it to the kernel info table KernelInfoTable[kernelName] = info; @@ -571,7 +571,7 @@ "Symbol info extraction", get_error_string(err)); return err; } - DEBUG_PRINT("Exec Symbol type: %d\n", type); + DP("Exec Symbol type: %d\n", type); if (type == HSA_SYMBOL_KIND_KERNEL) { err = hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); @@ -636,11 +636,10 @@ return err; } - DEBUG_PRINT( - "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " - "kernarg\n", - kernelName.c_str(), info.kernel_object, info.group_segment_size, - info.private_segment_size, info.kernel_segment_size); + DP("Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " + "kernarg\n", + kernelName.c_str(), info.kernel_object, info.group_segment_size, + info.private_segment_size, info.kernel_segment_size); // assign it back to the kernel info table KernelInfoTable[kernelName] = info; @@ -681,12 +680,11 @@ return err; } - DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, - info.size); + DP("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size); SymbolInfoTable[std::string(name)] = info; free(name); } else { - DEBUG_PRINT("Symbol is an indirect function\n"); + DP("Symbol is an indirect function\n"); } return HSA_STATUS_SUCCESS; } @@ -730,9 +728,8 @@ err = get_code_object_custom_metadata(module_bytes, module_size, KernelInfoTable); if (err != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Getting custom code object metadata", - get_error_string(err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Getting custom code object metadata", get_error_string(err)); continue; } @@ -741,8 +738,8 @@ err = hsa_code_object_deserialize(module_bytes, module_size, NULL, &code_object); if (err != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Code Object Deserialization", get_error_string(err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Code Object Deserialization", get_error_string(err)); continue; } assert(0 != code_object.handle); @@ -763,8 +760,8 @@ err = hsa_executable_load_code_object(executable, agent, code_object, NULL); if (err != HSA_STATUS_SUCCESS) { - DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Loading the code object", get_error_string(err)); + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Loading the code object", get_error_string(err)); continue; } @@ -772,7 +769,7 @@ } module_load_success = true; } while (0); - DEBUG_PRINT("Modules loaded successful? %d\n", module_load_success); + DP("Modules loaded successful? %d\n", module_load_success); if (module_load_success) { /* Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp +++ /dev/null @@ -1,39 +0,0 @@ -//===--- amdgpu/impl/utils.cpp ------------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "internal.h" -#include "rt.h" - -#include -#include - -const char *get_error_string(hsa_status_t err) { - const char *res; - hsa_status_t rc = hsa_status_string(err, &res); - return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN."; -} - -namespace core { -/* - * Environment variables - */ -void Environment::GetEnvAll() { - std::string var = GetEnv("ATMI_HELP"); - if (!var.empty()) { - printf("ATMI_MAX_HSA_QUEUE_SIZE : positive integer\n" - "ATMI_DEBUG : 1 for printing out trace/debug info\n"); - } - - var = GetEnv("ATMI_MAX_HSA_QUEUE_SIZE"); - if (!var.empty()) - max_queue_size_ = std::stoi(var); - - var = GetEnv("ATMI_DEBUG"); - if (!var.empty()) - debug_mode_ = std::stoi(var); -} -} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -28,18 +28,14 @@ #include "impl_runtime.h" #include "internal.h" +#include "rt.h" -#include "Debug.h" #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" #include "print_tracing.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" -#ifndef TARGET_NAME -#error "Missing TARGET_NAME macro" -#endif -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" // hostrpc interface, FIXME: consider moving to its own include these are // statically linked into amdgpu/plugin if present from hostrpc_services.a, @@ -84,16 +80,6 @@ #include "elf_common.h" -namespace core { -hsa_status_t RegisterModuleFromMemory( - std::map &KernelInfo, - std::map &SymbolInfoTable, void *, size_t, - hsa_agent_t agent, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector &HSAExecutables); -} - namespace hsa { template hsa_status_t iterate_agents(C cb) { auto L = [](hsa_agent_t agent, void *data) -> hsa_status_t { @@ -713,7 +699,7 @@ return; } - // Init hostcall soon after initializing ATMI + // Init hostcall soon after initializing hsa hostrpc_init(); err = FindAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { @@ -775,8 +761,9 @@ DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i); return; } - if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) { - queue_size = core::Runtime::getInstance().getMaxQueueSize(); + enum { MaxQueueSize = 4096 }; + if (queue_size > MaxQueueSize) { + queue_size = MaxQueueSize; } } @@ -819,7 +806,7 @@ // impl_finalize removes access to it deviceStateStore.clear(); KernelArgPoolMap.clear(); - // Terminate hostrpc before finalizing ATMI + // Terminate hostrpc before finalizing hsa hostrpc_terminate(); hsa_status_t Err; @@ -1530,7 +1517,7 @@ } } - DP("ATMI module successfully loaded!\n"); + DP("AMDGPU module successfully loaded!\n"); { // the device_State array is either large value in bss or a void* that @@ -2197,8 +2184,7 @@ memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); } - // Initialize implicit arguments. ATMI seems to leave most fields - // uninitialized + // Initialize implicit arguments. TODO: Which of these can be dropped impl_implicit_args_t *impl_args = reinterpret_cast( static_cast(kernarg) + ArgPool->kernarg_segment_size);