diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp @@ -20,7 +20,6 @@ extern ATLMachine g_atl_machine; namespace core { -void allow_access_to_all_gpu_agents(void *ptr); namespace { ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) { @@ -39,45 +38,38 @@ } } // namespace -void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) { +hsa_status_t register_allocation(void *ptr, size_t size, + atmi_mem_place_t place) { if (place.dev_type == ATMI_DEVTYPE_CPU) - allow_access_to_all_gpu_agents(ptr); + return allow_access_to_all_gpu_agents(ptr); + else + return HSA_STATUS_SUCCESS; } atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) { - atmi_status_t ret = ATMI_STATUS_SUCCESS; hsa_amd_memory_pool_t pool = get_memory_pool_by_mem_place(place); hsa_status_t err = hsa_amd_memory_pool_allocate(pool, size, 0, ptr); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "atmi_malloc", - get_error_string(err)); - exit(1); - } DEBUG_PRINT("Malloced [%s %d] %p\n", place.dev_type == ATMI_DEVTYPE_CPU ? "CPU" : "GPU", place.dev_id, *ptr); - if (err != HSA_STATUS_SUCCESS) - ret = ATMI_STATUS_ERROR; - register_allocation(*ptr, size, place); + if (err == HSA_STATUS_SUCCESS) { + err = register_allocation(*ptr, size, place); + if (err != HSA_STATUS_SUCCESS) { + // Registration failed: release the pool allocation instead of leaking it. + hsa_amd_memory_pool_free(*ptr); + *ptr = nullptr; + } + } - return ret; + return (err == HSA_STATUS_SUCCESS) ?
ATMI_STATUS_SUCCESS : ATMI_STATUS_ERROR; } atmi_status_t Runtime::Memfree(void *ptr) { - atmi_status_t ret = ATMI_STATUS_SUCCESS; - hsa_status_t err; - err = hsa_amd_memory_pool_free(ptr); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "atmi_free", - get_error_string(err)); - exit(1); - } + hsa_status_t err = hsa_amd_memory_pool_free(ptr); DEBUG_PRINT("Freed %p\n", ptr); - if (err != HSA_STATUS_SUCCESS) - ret = ATMI_STATUS_ERROR; - return ret; + return (err == HSA_STATUS_SUCCESS) ? ATMI_STATUS_SUCCESS : ATMI_STATUS_ERROR; } } // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -48,12 +48,6 @@ extern "C" { #endif -#define check(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ - printf("%s failed.\n", #msg); \ - exit(1); \ - } - #ifdef DEBUG #define DEBUG_PRINT(fmt, ...) 
\ if (core::Runtime::getInstance().getDebugMode()) { \ @@ -217,15 +211,14 @@ alignDown((intptr_t)(value + alignment - 1), alignment)); } -extern void register_allocation(void *addr, size_t size, - atmi_mem_place_t place); +hsa_status_t register_allocation(void *addr, size_t size, + atmi_mem_place_t place); extern bool atl_is_atmi_initialized(); bool handle_group_signal(hsa_signal_value_t value, void *arg); - -void allow_access_to_all_gpu_agents(void *ptr); +hsa_status_t allow_access_to_all_gpu_agents(void *ptr); } // namespace core const char *get_error_string(hsa_status_t err); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -166,20 +166,14 @@ bool atl_is_atmi_initialized() { return g_atmi_initialized; } -void allow_access_to_all_gpu_agents(void *ptr) { - hsa_status_t err; +hsa_status_t allow_access_to_all_gpu_agents(void *ptr) { std::vector<ATLGPUProcessor> &gpu_procs = g_atl_machine.processors<ATLGPUProcessor>(); std::vector<hsa_agent_t> agents; for (uint32_t i = 0; i < gpu_procs.size(); i++) { agents.push_back(gpu_procs[i].agent()); } - err = hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Allow agents ptr access", get_error_string(err)); - exit(1); - } + return hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr); } atmi_status_t Runtime::Initialize() { @@ -188,10 +182,11 @@ return ATMI_STATUS_SUCCESS; if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) { - if (atl_init_gpu_context() != ATMI_STATUS_SUCCESS) { + atmi_status_t rc = atl_init_gpu_context(); + if (rc != ATMI_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "GPU context init", - get_atmi_error_string(atl_init_gpu_context())); - exit(1); + get_atmi_error_string(rc)); + return rc; } } @@ -200,8 +195,7 @@ } atmi_status_t Runtime::Finalize() 
{ - hsa_status_t err; - + atmi_status_t rc = ATMI_STATUS_SUCCESS; for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) { SymbolInfoTable[i].clear(); } @@ -212,14 +206,14 @@ KernelInfoTable.clear(); atl_reset_atmi_initialized(); - err = hsa_shut_down(); + hsa_status_t err = hsa_shut_down(); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA", get_error_string(err)); - exit(1); + rc = ATMI_STATUS_ERROR; } - return ATMI_STATUS_SUCCESS; + return rc; } static void atmi_init_context_structs() { @@ -243,7 +237,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Alloc allowed in memory pool check", get_error_string(err)); - exit(1); + return err; } if (alloc_allowed) { uint32_t global_flag = 0; @@ -252,7 +246,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Get memory pool info", get_error_string(err)); - exit(1); + return err; } if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) { ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED); @@ -277,29 +271,27 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Get device type info", get_error_string(err)); - exit(1); + return err; } switch (device_type) { case HSA_DEVICE_TYPE_CPU: { - ; ATLCPUProcessor new_proc(agent); err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info, &new_proc); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Iterate all memory pools", get_error_string(err)); - exit(1); + return err; } g_atl_machine.addProcessor(new_proc); } break; case HSA_DEVICE_TYPE_GPU: { - ; hsa_profile_t profile; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile); if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Query the agent profile", get_error_string(err)); - exit(1); + return err; } atmi_devtype_t gpu_type; gpu_type = @@ 
-310,7 +302,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Iterate all memory pools", get_error_string(err)); - exit(1); + return err; } g_atl_machine.addProcessor(new_proc); } break; @@ -368,10 +360,8 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting a gpu agent", get_error_string(err)); - exit(1); - } - if (err != HSA_STATUS_SUCCESS) return err; + } /* Init all devices or individual device types? */ std::vector &cpu_procs = @@ -510,7 +500,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Finding a CPU kernarg memory region handle", get_error_string(err)); - exit(1); + return err; } } hsa_region_t atl_gpu_kernarg_region; @@ -524,7 +514,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Finding a kernarg memory region", get_error_string(err)); - exit(1); + return err; } } if (num_procs > 0) @@ -540,7 +530,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Initializing the hsa runtime", get_error_string(err)); - exit(1); + return err; } if (err != HSA_STATUS_SUCCESS) return err; @@ -551,7 +541,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "After initializing compute and memory", get_error_string(err)); - exit(1); + return err; } int gpu_count = g_atl_machine.processorCount(); @@ -635,7 +625,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Registering the system for memory faults", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } init_tasks(); @@ -1018,7 +1008,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } DEBUG_PRINT("Exec Symbol type: %d\n", type); if (type == HSA_SYMBOL_KIND_KERNEL) { @@ -1027,7 +1017,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] 
%s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } char *name = reinterpret_cast<char *>(malloc(name_length + 1)); err = hsa_executable_symbol_get_info(symbol, @@ -1035,7 +1025,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } name[name_length] = 0; @@ -1043,12 +1033,7 @@ // did not find kernel name in the kernel map; this can happen only // if the ROCr API for getting symbol info (name) is different from // the comgr method of getting symbol info - if (HSA_STATUS_ERROR_INVALID_CODE_OBJECT != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Invalid kernel name", - get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT)); - exit(1); - } + return HSA_STATUS_ERROR; } atl_kernel_info_t info; std::string kernelName = KernelNameMap[std::string(name)]; @@ -1056,12 +1041,7 @@ // because the non-ROCr custom code object parsing is called before // iterating over the code object symbols using ROCr if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) { - if (HSA_STATUS_ERROR_INVALID_CODE_OBJECT != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Finding the entry kernel info table", - get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT)); - exit(1); - } + return HSA_STATUS_ERROR; } // found, so assign and update info = KernelInfoTable[gpu][kernelName]; @@ -1074,7 +1054,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Extracting the symbol from the executable", get_error_string(err)); - exit(1); + return err; } err = hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, @@ -1083,7 +1063,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Extracting the group segment size from the executable", get_error_string(err)); - exit(1); + return err; } err = 
hsa_executable_symbol_get_info( symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, @@ -1092,7 +1072,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Extracting the private segment from the executable", get_error_string(err)); - exit(1); + return err; } DEBUG_PRINT( @@ -1110,7 +1090,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } char *name = reinterpret_cast<char *>(malloc(name_length + 1)); err = hsa_executable_symbol_get_info(symbol, @@ -1118,7 +1098,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info extraction", get_error_string(err)); - exit(1); + return err; } name[name_length] = 0; @@ -1129,7 +1109,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info address extraction", get_error_string(err)); - exit(1); + return err; } err = hsa_executable_symbol_get_info( @@ -1137,14 +1117,17 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Symbol info size extraction", get_error_string(err)); - exit(1); + return err; } atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0); DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size); - register_allocation(reinterpret_cast<void *>(info.addr), (size_t)info.size, - place); + err = register_allocation(reinterpret_cast<void *>(info.addr), + (size_t)info.size, place); + if (err != HSA_STATUS_SUCCESS) { + return err; + } SymbolInfoTable[gpu][std::string(name)] = info; if (strcmp(name, "needs_hostcall_buffer") == 0) g_atmi_hostcall_required = true; @@ -1174,7 +1157,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Query the agent profile", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } // FIXME: Assume that every profile is FULL until we understand how to build // GCN with 
base profile @@ -1185,7 +1168,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Create the executable", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } bool module_load_success = false; @@ -1223,7 +1206,7 @@ printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Error in deserialized_data callback", get_atmi_error_string(atmi_err)); - exit(1); + return atmi_err; } /* Load the code object. */ @@ -1246,7 +1229,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Freeze the executable", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } err = hsa_executable_iterate_symbols(executable, populate_InfoTables, @@ -1254,7 +1237,7 @@ if (err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Iterating over symbols for execuatable", get_error_string(err)); - exit(1); + return ATMI_STATUS_ERROR; } // save the executable and destroy during finalize diff --git a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/utils.cpp @@ -99,7 +99,6 @@ std::cout << "ATMI_MAX_HSA_QUEUE_SIZE : positive integer" << std::endl << "ATMI_DEBUG : 1 for printing out trace/debug info" << std::endl; - exit(0); } var = GetEnv("ATMI_MAX_HSA_QUEUE_SIZE"); diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -72,7 +72,6 @@ int print_kernel_trace; -#undef check // Drop definition from internal.h #ifdef OMPTARGET_DEBUG #define check(msg, status) \ if (status != ATMI_STATUS_SUCCESS) { \ @@ -116,11 +115,8 @@ ~KernelArgPool() { if (kernarg_region) { auto r = hsa_amd_memory_pool_free(kernarg_region); - assert(r == HSA_STATUS_SUCCESS); if (r != 
HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Memory pool free", get_error_string(r)); - exit(1); + DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(r)); } } } @@ -135,22 +131,33 @@ // atmi uses one pool per kernel for all gpus, with a fixed upper size // preserving that exact scheme here, including the queue - { - hsa_status_t err = hsa_amd_memory_pool_allocate( - atl_gpu_kernarg_pools[0], - kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, - &kernarg_region); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Allocating memory for the executable-kernel", - get_error_string(err)); - exit(1); - } - core::allow_access_to_all_gpu_agents(kernarg_region); - for (int i = 0; i < MAX_NUM_KERNELS; i++) { - free_kernarg_segments.push(i); + hsa_status_t err = hsa_amd_memory_pool_allocate( + atl_gpu_kernarg_pools[0], + kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, + &kernarg_region); + + if (err != HSA_STATUS_SUCCESS) { + DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(err)); + kernarg_region = nullptr; // paranoid + return; + } + + err = core::allow_access_to_all_gpu_agents(kernarg_region); + if (err != HSA_STATUS_SUCCESS) { + DP("hsa allow_access_to_all_gpu_agents failed: %s\n", + get_error_string(err)); + auto r = hsa_amd_memory_pool_free(kernarg_region); + if (r != HSA_STATUS_SUCCESS) { + // if free failed, can't do anything more to resolve it + DP("hsa memory pool free failed: %s\n", get_error_string(r)); } + kernarg_region = nullptr; + return; + } + + for (int i = 0; i < MAX_NUM_KERNELS; i++) { + free_kernarg_segments.push(i); } } @@ -474,17 +481,18 @@ NumThreads.resize(NumberOfDevices); deviceStateStore.resize(NumberOfDevices); + for (int i = 0; i < NumberOfDevices; i++) { + HSAQueues[i] = nullptr; + } + for (int i = 0; i < NumberOfDevices; i++) { uint32_t queue_size = 0; { - hsa_status_t err; - err = hsa_agent_get_info(HSAAgents[i], 
HSA_AGENT_INFO_QUEUE_MAX_SIZE, - &queue_size); + hsa_status_t err = hsa_agent_get_info( + HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Querying the agent maximum queue size", - get_error_string(err)); - exit(1); + DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i); + return; } if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) { queue_size = core::Runtime::getInstance().getMaxQueueSize(); @@ -495,7 +503,7 @@ HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &HSAQueues[i]); if (rc != HSA_STATUS_SUCCESS) { - DP("Failed to create HSA queues\n"); + DP("Failed to create HSA queue %d\n", i); return; } @@ -540,18 +548,6 @@ RequiresFlags = OMP_REQ_UNDEFINED; } - void DestroyHSAExecutables() { - hsa_status_t Err; - for (uint32_t I = 0; I < HSAExecutables.size(); I++) { - Err = hsa_executable_destroy(HSAExecutables[I]); - if (Err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Destroying executable", get_error_string(Err)); - return; - } - } - } - ~RTLDeviceInfoTy() { DP("Finalizing the HSA-ATMI DeviceInfo.\n"); // Run destructors on types that use HSA before @@ -561,7 +557,14 @@ // Terminate hostrpc before finalizing ATMI hostrpc_terminate(); - DestroyHSAExecutables(); + for (uint32_t I = 0; I < HSAExecutables.size(); I++) { + hsa_status_t Err = hsa_executable_destroy(HSAExecutables[I]); + if (Err != HSA_STATUS_SUCCESS) { + DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Destroying executable", get_error_string(Err)); + } + } + atmi_finalize(); } }; @@ -1010,9 +1013,8 @@ if (rc == 0) { if (size_si.size != sizeof(uint64_t)) { - fprintf(stderr, - "Found device_State_size variable with wrong size, aborting\n"); - exit(1); + DP("Found device_State_size variable with wrong size\n"); + return 0; } // Read number of bytes directly from the elf @@ -1828,6 +1830,9 @@ // Run on the 
device. { hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id]; + if (!queue) { + return OFFLOAD_FAIL; + } uint64_t packet_id = acquire_available_packet_id(queue); const uint32_t mask = queue->size - 1; // size is a power of 2 @@ -1867,8 +1872,8 @@ } } if (!ArgPool) { - fprintf(stderr, "Warning: No ArgPool for %s on device %d\n", - KernelInfo->Name, device_id); + DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, + device_id); } { void *kernarg = nullptr; @@ -1877,8 +1882,8 @@ kernarg = ArgPool->allocate(arg_num); } if (!kernarg) { - printf("Allocate kernarg failed\n"); - exit(1); + DP("Allocate kernarg failed\n"); + return OFFLOAD_FAIL; } // Copy explicit arguments @@ -1919,8 +1924,8 @@ { hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); if (s.handle == 0) { - printf("Failed to get signal instance\n"); - exit(1); + DP("Failed to get signal instance\n"); + return OFFLOAD_FAIL; } packet->completion_signal = s; hsa_signal_store_relaxed(packet->completion_signal, 1);