diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -13,10 +13,21 @@ #include #include +/// Generic launch parameters for configuration the number of blocks / threads. +struct LaunchParameters { + uint32_t num_threads_x; + uint32_t num_threads_y; + uint32_t num_threads_z; + uint32_t num_blocks_x; + uint32_t num_blocks_y; + uint32_t num_blocks_z; +}; + /// Generic interface to load the \p image and launch execution of the _start /// kernel on the target device. Copies \p argc and \p argv to the device. /// Returns the final value of the `main` function on the device. -int load(int argc, char **argv, char **evnp, void *image, size_t size); +int load(int argc, char **argv, char **evnp, void *image, size_t size, + const LaunchParameters ¶ms); /// Copy the system's argument vector to GPU memory allocated using \p alloc. template diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp --- a/libc/utils/gpu/loader/Main.cpp +++ b/libc/utils/gpu/loader/Main.cpp @@ -15,21 +15,69 @@ #include #include +#include +#include int main(int argc, char **argv, char **envp) { if (argc < 2) { - printf("USAGE: ./loader , ...\n"); + printf("USAGE: ./loader [--threads , --blocks ] " + ", ...\n"); return EXIT_SUCCESS; } - // TODO: We should perform some validation on the file. - FILE *file = fopen(argv[1], "r"); + int offset = 0; + FILE *file = nullptr; + char *ptr; + LaunchParameters params = {1, 1, 1, 1, 1, 1}; + while (!file && ++offset < argc) { + if (argv[offset] == std::string("--threads") || + argv[offset] == std::string("--threads-x")) { + params.num_threads_x = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--threads-y")) { + params.num_threads_y = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--threads-z")) { + params.num_threads_z = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--blocks") || + argv[offset] == std::string("--blocks-x")) { + params.num_blocks_x = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--blocks-y")) { + params.num_blocks_y = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--blocks-z")) { + params.num_blocks_z = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else { + file = fopen(argv[offset], "r"); + if (!file) { + fprintf(stderr, "Failed to open image file '%s'\n", argv[offset]); + return EXIT_FAILURE; + } + break; + } + } if (!file) { - fprintf(stderr, "Failed to open image file %s\n", argv[1]); + fprintf(stderr, "No image file provided\n"); return EXIT_FAILURE; } + // TODO: We should perform some validation on the file. fseek(file, 0, SEEK_END); const auto size = ftell(file); fseek(file, 0, SEEK_SET); @@ -39,7 +87,7 @@ fclose(file); // Drop the loader from the program arguments. - int ret = load(argc - 1, &argv[1], envp, image, size); + int ret = load(argc - offset, &argv[offset], envp, image, size, params); free(image); return ret; diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -170,7 +170,8 @@ return iterate_agent_memory_pools(agent, cb); } -int load(int argc, char **argv, char **envp, void *image, size_t size) { +int load(int argc, char **argv, char **envp, void *image, size_t size, + const LaunchParameters ¶ms) { // Initialize the HSA runtime used to communicate with the device. if (hsa_status_t err = hsa_init()) handle_error(err); @@ -355,13 +356,15 @@ // with one thread on the device, forcing the rest of the wavefront to be // masked off. std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t)); - packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = 1; - packet->workgroup_size_y = 1; - packet->workgroup_size_z = 1; - packet->grid_size_x = 1; - packet->grid_size_y = 1; - packet->grid_size_z = 1; + packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) + + (params.num_blocks_z * params.num_threads_z != 1)) + << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet->workgroup_size_x = params.num_threads_x; + packet->workgroup_size_y = params.num_threads_y; + packet->workgroup_size_z = params.num_threads_z; + packet->grid_size_x = params.num_blocks_x * params.num_threads_x; + packet->grid_size_y = params.num_blocks_y * params.num_threads_y; + packet->grid_size_z = params.num_blocks_z * params.num_threads_z; packet->private_segment_size = private_size; packet->group_segment_size = group_size; packet->kernel_object = kernel; diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -76,7 +76,8 @@ exit(EXIT_FAILURE); } -int load(int argc, char **argv, char **envp, void *image, size_t size) { +int load(int argc, char **argv, char **envp, void *image, size_t size, + const LaunchParameters ¶ms) { if (CUresult err = cuInit(0)) handle_error(err); @@ -157,10 +158,10 @@ server.reset(server_inbox, server_outbox, buffer); // Call the kernel with the given arguments. - if (CUresult err = - cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1, - /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1, - /*bloackDimZ=*/1, 0, stream, nullptr, args_config)) + if (CUresult err = cuLaunchKernel( + function, params.num_blocks_x, params.num_blocks_y, + params.num_blocks_z, params.num_threads_x, params.num_threads_y, + params.num_threads_z, 0, stream, nullptr, args_config)) handle_error(err); // Wait until the kernel has completed execution on the device. Periodically