diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -2185,6 +2185,7 @@ packet->completion_signal = {0}; // may want a pool of signals KernelArgPool *ArgPool = nullptr; + void *kernarg = nullptr; { auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); if (it != KernelArgPoolMap.end()) { @@ -2196,7 +2197,6 @@ device_id); } { - void *kernarg = nullptr; if (ArgPool) { assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); kernarg = ArgPool->allocate(arg_num); @@ -2240,29 +2240,29 @@ packet->kernarg_address = kernarg; } - { - hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); - if (s.handle == 0) { - DP("Failed to get signal instance\n"); - return OFFLOAD_FAIL; - } - packet->completion_signal = s; - hsa_signal_store_relaxed(packet->completion_signal, 1); + hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); + if (s.handle == 0) { + DP("Failed to get signal instance\n"); + return OFFLOAD_FAIL; } + packet->completion_signal = s; + hsa_signal_store_relaxed(packet->completion_signal, 1); + // Publish the packet indicating it is ready to be processed core::packet_store_release(reinterpret_cast(packet), core::create_header(), packet->setup); + // Since the packet is already published, its contents must not be + // accessed any more hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); - while (hsa_signal_wait_scacquire(packet->completion_signal, - HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) ; assert(ArgPool); - ArgPool->deallocate(packet->kernarg_address); - DeviceInfo.FreeSignalPool.push(packet->completion_signal); + ArgPool->deallocate(kernarg); + DeviceInfo.FreeSignalPool.push(s); } DP("Kernel completed\n");