diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -221,6 +221,10 @@
              /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
     handle_server();
 
+  // Service the server one final time after the wait loop, since the kernel
+  // may have exited with a send still pending that has not been handled yet.
+  handle_server();
+
   // Destroy the resources acquired to launch the kernel and return.
   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
     handle_error(err);
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -186,6 +186,10 @@
   while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
     handle_server();
 
+  // Service the server one final time after the wait loop, since the kernel
+  // may have exited with a send still pending that has not been handled yet.
+  handle_server();
+
   return CUDA_SUCCESS;
 }