diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -221,6 +221,10 @@ /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0) handle_server(); + // Handle the server one more time in case the kernel exited with a pending + // send still in flight. + handle_server(); + // Destroy the resources acquired to launch the kernel and return. if (hsa_status_t err = hsa_amd_memory_pool_free(args)) handle_error(err); diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -186,6 +186,10 @@ while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY) handle_server(); + // Handle the server one more time in case the kernel exited with a pending + // send still in flight. + handle_server(); + return CUDA_SUCCESS; }