diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -13,10 +13,21 @@
 #include <cstring>
 #include <stddef.h>
 
+/// Generic launch parameters for configuration the number of blocks / threads.
+struct LaunchParameters {
+  uint32_t num_threads_x;
+  uint32_t num_threads_y;
+  uint32_t num_threads_z;
+  uint32_t num_blocks_x;
+  uint32_t num_blocks_y;
+  uint32_t num_blocks_z;
+};
+
 /// Generic interface to load the \p image and launch execution of the _start
 /// kernel on the target device. Copies \p argc and \p argv to the device.
 /// Returns the final value of the `main` function on the device.
-int load(int argc, char **argv, char **evnp, void *image, size_t size);
+int load(int argc, char **argv, char **evnp, void *image, size_t size,
+         const LaunchParameters &params);
 
 /// Copy the system's argument vector to GPU memory allocated using \p alloc.
 template <typename Allocator>
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -15,21 +15,69 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <string>
+#include <vector>
 
 int main(int argc, char **argv, char **envp) {
   if (argc < 2) {
-    printf("USAGE: ./loader <device_image> <args>, ...\n");
+    printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
+           "<args>, ...\n");
     return EXIT_SUCCESS;
   }
 
-  // TODO: We should perform some validation on the file.
-  FILE *file = fopen(argv[1], "r");
+  int offset = 0;
+  FILE *file = nullptr;
+  char *ptr;
+  LaunchParameters params = {1, 1, 1, 1, 1, 1};
+  while (!file && ++offset < argc) {
+    if (argv[offset] == std::string("--threads") ||
+        argv[offset] == std::string("--threads-x")) {
+      params.num_threads_x =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--threads-y")) {
+      params.num_threads_y =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--threads-z")) {
+      params.num_threads_z =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--blocks") ||
+               argv[offset] == std::string("--blocks-x")) {
+      params.num_blocks_x =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--blocks-y")) {
+      params.num_blocks_y =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else if (argv[offset] == std::string("--blocks-z")) {
+      params.num_blocks_z =
+          offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+      offset++;
+      continue;
+    } else {
+      file = fopen(argv[offset], "r");
+      if (!file) {
+        fprintf(stderr, "Failed to open image file '%s'\n", argv[offset]);
+        return EXIT_FAILURE;
+      }
+      break;
+    }
+  }
 
   if (!file) {
-    fprintf(stderr, "Failed to open image file %s\n", argv[1]);
+    fprintf(stderr, "No image file provided\n");
     return EXIT_FAILURE;
   }
 
+  // TODO: We should perform some validation on the file.
   fseek(file, 0, SEEK_END);
   const auto size = ftell(file);
   fseek(file, 0, SEEK_SET);
@@ -39,7 +87,7 @@
   fclose(file);
 
   // Drop the loader from the program arguments.
-  int ret = load(argc - 1, &argv[1], envp, image, size);
+  int ret = load(argc - offset, &argv[offset], envp, image, size, params);
 
   free(image);
   return ret;
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -170,7 +170,8 @@
   return iterate_agent_memory_pools(agent, cb);
 }
 
-int load(int argc, char **argv, char **envp, void *image, size_t size) {
+int load(int argc, char **argv, char **envp, void *image, size_t size,
+         const LaunchParameters &params) {
   // Initialize the HSA runtime used to communicate with the device.
   if (hsa_status_t err = hsa_init())
     handle_error(err);
@@ -355,13 +356,15 @@
   // with one thread on the device, forcing the rest of the wavefront to be
   // masked off.
   std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
-  packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-  packet->workgroup_size_x = 1;
-  packet->workgroup_size_y = 1;
-  packet->workgroup_size_z = 1;
-  packet->grid_size_x = 1;
-  packet->grid_size_y = 1;
-  packet->grid_size_z = 1;
+  packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
+                   (params.num_blocks_z * params.num_threads_z != 1))
+                  << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  packet->workgroup_size_x = params.num_threads_x;
+  packet->workgroup_size_y = params.num_threads_y;
+  packet->workgroup_size_z = params.num_threads_z;
+  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
+  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
+  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
   packet->private_segment_size = private_size;
   packet->group_segment_size = group_size;
   packet->kernel_object = kernel;
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -76,7 +76,8 @@
   exit(EXIT_FAILURE);
 }
 
-int load(int argc, char **argv, char **envp, void *image, size_t size) {
+int load(int argc, char **argv, char **envp, void *image, size_t size,
+         const LaunchParameters &params) {
   if (CUresult err = cuInit(0))
     handle_error(err);
 
@@ -157,10 +158,10 @@
   server.reset(server_inbox, server_outbox, buffer);
 
   // Call the kernel with the given arguments.
-  if (CUresult err =
-          cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
-                         /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
-                         /*bloackDimZ=*/1, 0, stream, nullptr, args_config))
+  if (CUresult err = cuLaunchKernel(
+          function, params.num_blocks_x, params.num_blocks_y,
+          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
+          params.num_threads_z, 0, stream, nullptr, args_config))
     handle_error(err);
 
   // Wait until the kernel has completed execution on the device. Periodically