diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp @@ -20,7 +20,6 @@ extern ATLMachine g_atl_machine; namespace core { -void allow_access_to_all_gpu_agents(void *ptr); namespace { ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) { @@ -39,45 +38,33 @@ } } // namespace -void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) { +hsa_status_t register_allocation(void *ptr, size_t size, + atmi_mem_place_t place) { if (place.dev_type == ATMI_DEVTYPE_CPU) - allow_access_to_all_gpu_agents(ptr); + return allow_access_to_all_gpu_agents(ptr); + else + return HSA_STATUS_SUCCESS; } atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) { - atmi_status_t ret = ATMI_STATUS_SUCCESS; hsa_amd_memory_pool_t pool = get_memory_pool_by_mem_place(place); hsa_status_t err = hsa_amd_memory_pool_allocate(pool, size, 0, ptr); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "atmi_malloc", - get_error_string(err)); - exit(1); - } DEBUG_PRINT("Malloced [%s %d] %p\n", place.dev_type == ATMI_DEVTYPE_CPU ? "CPU" : "GPU", place.dev_id, *ptr); - if (err != HSA_STATUS_SUCCESS) - ret = ATMI_STATUS_ERROR; - register_allocation(*ptr, size, place); + if (err == HSA_STATUS_SUCCESS) { + err = register_allocation(*ptr, size, place); + } - return ret; + return (err == HSA_STATUS_SUCCESS) ? 
ATMI_STATUS_SUCCESS : ATMI_STATUS_ERROR; } atmi_status_t Runtime::Memfree(void *ptr) { - atmi_status_t ret = ATMI_STATUS_SUCCESS; - hsa_status_t err; - err = hsa_amd_memory_pool_free(ptr); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "atmi_free", - get_error_string(err)); - exit(1); - } + hsa_status_t err = hsa_amd_memory_pool_free(ptr); DEBUG_PRINT("Freed %p\n", ptr); - if (err != HSA_STATUS_SUCCESS) - ret = ATMI_STATUS_ERROR; - return ret; + return (err == HSA_STATUS_SUCCESS) ? ATMI_STATUS_SUCCESS : ATMI_STATUS_ERROR; } } // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -48,12 +48,6 @@ extern "C" { #endif -#define check(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ - printf("%s failed.\n", #msg); \ - exit(1); \ - } - #ifdef DEBUG #define DEBUG_PRINT(fmt, ...) 
\ if (core::Runtime::getInstance().getDebugMode()) { \ @@ -220,15 +214,14 @@ alignDown((intptr_t)(value + alignment - 1), alignment)); } -extern void register_allocation(void *addr, size_t size, - atmi_mem_place_t place); +hsa_status_t register_allocation(void *addr, size_t size, + atmi_mem_place_t place); extern bool atl_is_atmi_initialized(); bool handle_group_signal(hsa_signal_value_t value, void *arg); - -void allow_access_to_all_gpu_agents(void *ptr); +hsa_status_t allow_access_to_all_gpu_agents(void *ptr); } // namespace core const char *get_error_string(hsa_status_t err); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -177,20 +177,14 @@ bool atl_is_atmi_initialized() { return g_atmi_initialized; } -void allow_access_to_all_gpu_agents(void *ptr) { - hsa_status_t err; +hsa_status_t allow_access_to_all_gpu_agents(void *ptr) { std::vector &gpu_procs = g_atl_machine.processors(); std::vector agents; for (uint32_t i = 0; i < gpu_procs.size(); i++) { agents.push_back(gpu_procs[i].agent()); } - err = hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Allow agents ptr access", get_error_string(err)); - exit(1); - } + return hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr); } atmi_status_t Runtime::Initialize() { @@ -199,10 +193,11 @@ return ATMI_STATUS_SUCCESS; if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) { - if (atl_init_gpu_context() != ATMI_STATUS_SUCCESS) { + atmi_status_t rc = atl_init_gpu_context(); + if (rc != ATMI_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "GPU context init", get_atmi_error_string(atl_init_gpu_context())); - exit(1); + return rc; } } @@ -211,14 +206,14 @@ } atmi_status_t 
Runtime::Finalize() { - hsa_status_t err; + atmi_status_t rc = ATMI_STATUS_SUCCESS; for (uint32_t i = 0; i < g_executables.size(); i++) { - err = hsa_executable_destroy(g_executables[i]); + hsa_status_t err = hsa_executable_destroy(g_executables[i]); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Destroying executable", get_error_string(err)); - exit(1); + rc = ATMI_STATUS_ERROR; } } @@ -232,14 +227,14 @@ KernelInfoTable.clear(); atl_reset_atmi_initialized(); - err = hsa_shut_down(); + hsa_status_t err = hsa_shut_down(); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA", get_error_string(err)); - exit(1); + rc = ATMI_STATUS_ERROR; } - return ATMI_STATUS_SUCCESS; + return rc; } static void atmi_init_context_structs() { @@ -264,7 +259,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Alloc allowed in memory pool check", get_error_string(err)); - exit(1); + return err; } if (alloc_allowed) { uint32_t global_flag = 0; @@ -273,7 +268,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Get memory pool info", get_error_string(err)); - exit(1); + return err; } if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) { ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED); @@ -298,29 +293,27 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Get device type info", get_error_string(err)); - exit(1); + return err; } switch (device_type) { case HSA_DEVICE_TYPE_CPU: { - ; ATLCPUProcessor new_proc(agent); err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info, &new_proc); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Iterate all memory pools", get_error_string(err)); - exit(1); + return err; } g_atl_machine.addProcessor(new_proc); } break; case HSA_DEVICE_TYPE_GPU: { - ; hsa_profile_t 
profile; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Query the agent profile", get_error_string(err)); - exit(1); + return err; } atmi_devtype_t gpu_type; gpu_type = @@ -331,7 +324,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Iterate all memory pools", get_error_string(err)); - exit(1); + return err; } g_atl_machine.addProcessor(new_proc); } break; @@ -389,10 +382,8 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting a gpu agent", get_error_string(err)); - exit(1); - } - if (err != HSA_STATUS_SUCCESS) return err; + } /* Init all devices or individual device types? */ std::vector &cpu_procs = @@ -530,7 +521,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Finding a CPU kernarg memory region handle", get_error_string(err)); - exit(1); + return err; } } /* Find a memory region that supports kernel arguments. 
*/ @@ -543,7 +534,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Finding a kernarg memory region", get_error_string(err)); - exit(1); + return err; } } if (num_procs > 0) @@ -559,7 +550,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Initializing the hsa runtime", get_error_string(err)); - exit(1); + return err; } if (err != HSA_STATUS_SUCCESS) return err; @@ -570,7 +561,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "After initializing compute and memory", get_error_string(err)); - exit(1); + return err; } int gpu_count = g_atl_machine.processorCount(); @@ -659,7 +650,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Registering the system for memory faults", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } init_tasks(); @@ -1042,7 +1033,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } DEBUG_PRINT("Exec Symbol type: %d\n", type); if (type == HSA_SYMBOL_KIND_KERNEL) { @@ -1051,7 +1042,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } char *name = reinterpret_cast(malloc(name_length + 1)); err = hsa_executable_symbol_get_info(symbol, @@ -1059,7 +1050,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } name[name_length] = 0; @@ -1067,12 +1058,7 @@ // did not find kernel name in the kernel map; this can happen only // if the ROCr API for getting symbol info (name) is different from // the comgr method of getting symbol info - if (HSA_STATUS_ERROR_INVALID_CODE_OBJECT != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", 
__FILE__, __LINE__, - "Invalid kernel name", - get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT)); - exit(1); - } + return HSA_STATUS_ERROR; } atl_kernel_info_t info; std::string kernelName = KernelNameMap[std::string(name)]; @@ -1080,12 +1066,7 @@ // because the non-ROCr custom code object parsing is called before // iterating over the code object symbols using ROCr if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) { - if (HSA_STATUS_ERROR_INVALID_CODE_OBJECT != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Finding the entry kernel info table", - get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT)); - exit(1); - } + return HSA_STATUS_ERROR; } // found, so assign and update info = KernelInfoTable[gpu][kernelName]; @@ -1098,7 +1079,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Extracting the symbol from the executable", get_error_string(err)); - exit(1); + return err; } err = hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, @@ -1107,7 +1088,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Extracting the group segment size from the executable", get_error_string(err)); - exit(1); + return err; } err = hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, @@ -1116,7 +1097,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Extracting the private segment from the executable", get_error_string(err)); - exit(1); + return err; } DEBUG_PRINT( @@ -1134,7 +1115,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } char *name = reinterpret_cast(malloc(name_length + 1)); err = hsa_executable_symbol_get_info(symbol, @@ -1142,7 +1123,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); 
+ return err; } name[name_length] = 0; @@ -1153,7 +1134,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info address extraction", get_error_string(err)); - exit(1); + return err; } err = hsa_executable_symbol_get_info( @@ -1161,14 +1142,17 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info size extraction", get_error_string(err)); - exit(1); + return err; } atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0); DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, - info.size); - register_allocation(reinterpret_cast(info.addr), (size_t)info.size, - place); + info.size); + err = register_allocation(reinterpret_cast(info.addr), + (size_t)info.size, place); + if (err != HSA_STATUS_SUCCESS) { + return err; + } SymbolInfoTable[gpu][std::string(name)] = info; if (strcmp(name, "needs_hostcall_buffer") == 0) g_atmi_hostcall_required = true; @@ -1198,7 +1182,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Query the agent profile", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } // FIXME: Assume that every profile is FULL until we understand how to build // GCN with base profile @@ -1209,7 +1193,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Create the executable", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } bool module_load_success = false; @@ -1247,7 +1231,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Error in deserialized_data callback", get_atmi_error_string(atmi_err)); - exit(1); + return atmi_err; } /* Load the code object. 
*/ @@ -1270,7 +1254,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Freeze the executable", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } err = hsa_executable_iterate_symbols(executable, populate_InfoTables, @@ -1278,7 +1262,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Iterating over symbols for execuatable", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } // save the executable and destroy during finalize diff --git a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp @@ -99,7 +99,6 @@ std::cout << "ATMI_MAX_HSA_QUEUE_SIZE : positive integer" << std::endl << "ATMI_DEBUG : 1 for printing out trace/debug info" << std::endl; - exit(0); } var = GetEnv("ATMI_MAX_HSA_QUEUE_SIZE"); diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -72,7 +72,6 @@ int print_kernel_trace; -#undef check // Drop definition from internal.h #ifdef OMPTARGET_DEBUG #define check(msg, status) \ if (status != ATMI_STATUS_SUCCESS) { \ @@ -116,11 +115,8 @@ ~KernelArgPool() { if (kernarg_region) { auto r = hsa_amd_memory_pool_free(kernarg_region); - assert(r == HSA_STATUS_SUCCESS); if (r != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Memory pool free", get_error_string(r)); - exit(1); + DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(r)); } } } @@ -141,15 +137,12 @@ kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, &kernarg_region); if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Allocating memory for the executable-kernel", - get_error_string(err)); - 
exit(1); - } - core::allow_access_to_all_gpu_agents(kernarg_region); - - for (int i = 0; i < MAX_NUM_KERNELS; i++) { - free_kernarg_segments.push(i); + DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(err)); + } else { + core::allow_access_to_all_gpu_agents(kernarg_region); + for (int i = 0; i < MAX_NUM_KERNELS; i++) { + free_kernarg_segments.push(i); + } } } } @@ -472,17 +465,18 @@ NumThreads.resize(NumberOfDevices); deviceStateStore.resize(NumberOfDevices); + for (int i = 0; i < NumberOfDevices; i++) { + HSAQueues[i] = nullptr; + } + for (int i = 0; i < NumberOfDevices; i++) { uint32_t queue_size = 0; { - hsa_status_t err; - err = hsa_agent_get_info(HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, - &queue_size); + hsa_status_t err = hsa_agent_get_info( + HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Querying the agent maximum queue size", - get_error_string(err)); - exit(1); + DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i); + return; } if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) { queue_size = core::Runtime::getInstance().getMaxQueueSize(); @@ -493,7 +487,7 @@ HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &HSAQueues[i]); if (rc != HSA_STATUS_SUCCESS) { - DP("Failed to create HSA queues\n"); + DP("Failed to create HSA queue %d\n", i); return; } @@ -993,9 +987,8 @@ if (rc == 0) { if (size_si.size != sizeof(uint64_t)) { - fprintf(stderr, - "Found device_State_size variable with wrong size, aborting\n"); - exit(1); + DP("Found device_State_size variable with wrong size\n"); + return 0; } // Read number of bytes directly from the elf @@ -1808,6 +1801,9 @@ // Run on the device. 
{ hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id]; + if (!queue) { + return OFFLOAD_FAIL; + } uint64_t packet_id = acquire_available_packet_id(queue); const uint32_t mask = queue->size - 1; // size is a power of 2 @@ -1849,8 +1845,8 @@ } } if (!ArgPool) { - fprintf(stderr, "Warning: No ArgPool for %s on device %d\n", - KernelInfo->Name, device_id); + DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, + device_id); } { void *kernarg = nullptr; @@ -1859,8 +1855,8 @@ kernarg = ArgPool->allocate(arg_num); } if (!kernarg) { - printf("Allocate kernarg failed\n"); - exit(1); + DP("Allocate kernarg failed\n"); + return OFFLOAD_FAIL; } // Copy explicit arguments @@ -1901,8 +1897,8 @@ { hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); if (s.handle == 0) { - printf("Failed to get signal instance\n"); - exit(1); + DP("Failed to get signal instance\n"); + return OFFLOAD_FAIL; } packet->completion_signal = s; hsa_signal_store_relaxed(packet->completion_signal, 1);