diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -13,10 +13,17 @@ #include #include +/// Generic launch parameters for configuration the number of blocks / threads. +struct LaunchParameters { + uint32_t num_threads_x; + uint32_t num_blocks_x; +}; + /// Generic interface to load the \p image and launch execution of the _start /// kernel on the target device. Copies \p argc and \p argv to the device. /// Returns the final value of the `main` function on the device. -int load(int argc, char **argv, char **evnp, void *image, size_t size); +int load(int argc, char **argv, char **evnp, void *image, size_t size, + const LaunchParameters ¶ms); /// Copy the system's argument vector to GPU memory allocated using \p alloc. template diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp --- a/libc/utils/gpu/loader/Main.cpp +++ b/libc/utils/gpu/loader/Main.cpp @@ -15,18 +15,33 @@ #include #include +#include +#include int main(int argc, char **argv, char **envp) { if (argc < 2) { - printf("USAGE: ./loader , ...\n"); + printf("USAGE: ./loader [--threads , --blocks ] " + ", ...\n"); return EXIT_SUCCESS; } // TODO: We should perform some validation on the file. - FILE *file = fopen(argv[1], "r"); + int offset = 0; + FILE *file; + LaunchParameters params = {1, 1}; + while (++offset < argc) { + file = fopen(argv[offset], "r"); + if (file || offset + 1 >= argc) + break; + + if (argv[offset] == std::string("--threads")) + params.num_threads_x = atoi(argv[offset + 1]); + else if (argv[offset] == std::string("--blocks")) + params.num_blocks_x = atoi(argv[offset + 1]); + } if (!file) { - fprintf(stderr, "Failed to open image file %s\n", argv[1]); + fprintf(stderr, "Failed to open image file '%s\n'", argv[offset]); return EXIT_FAILURE; } @@ -39,7 +54,7 @@ fclose(file); // Drop the loader from the program arguments. - int ret = load(argc - 1, &argv[1], envp, image, size); + int ret = load(argc - offset, &argv[offset], envp, image, size, params); free(image); return ret; diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -145,7 +145,8 @@ return iterate_agent_memory_pools(agent, cb); } -int load(int argc, char **argv, char **envp, void *image, size_t size) { +int load(int argc, char **argv, char **envp, void *image, size_t size, + const LaunchParameters ¶ms) { // Initialize the HSA runtime used to communicate with the device. if (hsa_status_t err = hsa_init()) handle_error(err); @@ -331,10 +332,10 @@ // masked off. std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t)); packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = 1; + packet->workgroup_size_x = params.num_threads_x; packet->workgroup_size_y = 1; packet->workgroup_size_z = 1; - packet->grid_size_x = 1; + packet->grid_size_x = params.num_blocks_x * params.num_threads_x; packet->grid_size_y = 1; packet->grid_size_z = 1; packet->private_segment_size = private_size; diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -51,7 +51,8 @@ exit(EXIT_FAILURE); } -int load(int argc, char **argv, char **envp, void *image, size_t size) { +int load(int argc, char **argv, char **envp, void *image, size_t size, + const LaunchParameters ¶ms) { if (CUresult err = cuInit(0)) handle_error(err); @@ -133,8 +134,8 @@ // Call the kernel with the given arguments. if (CUresult err = - cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1, - /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1, + cuLaunchKernel(function, params.num_blocks_x, /*gridDimY=*/1, + /*gridDimZ=*/1, params.num_threads_x, /*blockDimY=*/1, /*bloackDimZ=*/1, 0, stream, nullptr, args_config)) handle_error(err);