diff --git a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir --- a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/async.mlir b/mlir/test/mlir-cuda-runner/async.mlir --- a/mlir/test/mlir-cuda-runner/async.mlir +++ b/mlir/test/mlir-cuda-runner/async.mlir @@ -1,7 +1,10 @@ -// RUN: mlir-cuda-runner %s --entry-point-result=void -O0 \ +// RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-async -gpu-to-cuda-runtime \ +// RUN: -async-to-async-runtime -convert-async-to-llvm -convert-std-to-llvm \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_async_runtime%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ +// RUN: --entry-point-result=void -O0 \ // RUN: | FileCheck %s func @main() { diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir --- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir +++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir --- a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir +++ b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/shuffle.mlir b/mlir/test/mlir-cuda-runner/shuffle.mlir --- a/mlir/test/mlir-cuda-runner/shuffle.mlir +++ b/mlir/test/mlir-cuda-runner/shuffle.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/mlir-cuda-runner/two-modules.mlir b/mlir/test/mlir-cuda-runner/two-modules.mlir --- a/mlir/test/mlir-cuda-runner/two-modules.mlir +++ b/mlir/test/mlir-cuda-runner/two-modules.mlir @@ -1,4 +1,5 @@ // RUN: mlir-cuda-runner %s \ +// RUN: -gpu-to-cubin -gpu-to-cuda-runtime \ // RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -14,11 +14,9 @@ #include "llvm/ADT/STLExtras.h" -#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" +#include "mlir/Conversion/Passes.h" #include "mlir/Dialect/Async/IR/Async.h" #include "mlir/Dialect/Async/Passes.h" #include "mlir/Dialect/GPU/GPUDialect.h" @@ -44,35 +42,36 @@ using namespace mlir; -inline void emit_cuda_error(const llvm::Twine &message, const char *buffer, - CUresult error, Location loc) { - emitError(loc, message.concat(" failed with error code ") +static void emitCudaError(const llvm::Twine &expr, const char *buffer, + CUresult result, Location loc) { + const char *error; + cuGetErrorString(result, &error); + emitError(loc, expr.concat(" failed with error code ") .concat(llvm::Twine{error}) .concat("[") .concat(buffer) .concat("]")); } -#define RETURN_ON_CUDA_ERROR(expr, msg) \ - { \ - auto _cuda_error = (expr); \ - if (_cuda_error != CUDA_SUCCESS) { \ - emit_cuda_error(msg, jitErrorBuffer, _cuda_error, loc); \ +#define RETURN_ON_CUDA_ERROR(expr) \ + do { \ + if (auto status = (expr)) { \ + emitCudaError(#expr, jitErrorBuffer, status, loc); \ return {}; \ } \ - } + } while (false) OwnedBlob compilePtxToCubin(const std::string ptx, Location loc, StringRef name) { char jitErrorBuffer[4096] = {0}; - RETURN_ON_CUDA_ERROR(cuInit(0), "cuInit"); + RETURN_ON_CUDA_ERROR(cuInit(0)); // Linking requires a device context. CUdevice device; - RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0), "cuDeviceGet"); + RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0)); CUcontext context; - RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device), "cuCtxCreate"); + RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device)); CUlinkState linkState; CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, @@ -83,8 +82,7 @@ RETURN_ON_CUDA_ERROR(cuLinkCreate(2, /* number of jit options */ jitOptions, /* jit options */ jitOptionsVals, /* jit option values */ - &linkState), - "cuLinkCreate"); + &linkState)); RETURN_ON_CUDA_ERROR( cuLinkAddData(linkState, CUjitInputType::CU_JIT_INPUT_PTX, @@ -93,51 +91,74 @@ 0, /* number of jit options */ nullptr, /* jit options */ nullptr /* jit option values */ - ), - "cuLinkAddData"); + )); void *cubinData; size_t cubinSize; - RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize), - "cuLinkComplete"); + RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize)); char *cubinAsChar = static_cast(cubinData); OwnedBlob result = std::make_unique>(cubinAsChar, cubinAsChar + cubinSize); // This will also destroy the cubin data. - RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState), "cuLinkDestroy"); - RETURN_ON_CUDA_ERROR(cuCtxDestroy(context), "cuCtxDestroy"); + RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState)); + RETURN_ON_CUDA_ERROR(cuCtxDestroy(context)); return result; } -static LogicalResult runMLIRPasses(ModuleOp m) { - PassManager pm(m.getContext()); +// Register cuda-runner specific passes. +static void registerCudaRunnerPasses() { + const char gpuBinaryAnnotation[] = "nvvm.cubin"; + static PassPipelineRegistration<> registerToCubin( + "gpu-to-cubin", "Generate CUBIN from gpu.launch regions", + [&](OpPassManager &pm) { + pm.addPass(createGpuKernelOutliningPass()); + auto &kernelPm = pm.nest(); + kernelPm.addPass(createStripDebugInfoPass()); + kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); + kernelPm.addPass(createConvertGPUKernelToBlobPass( + translateModuleToLLVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda", + "sm_35", "+ptx60", gpuBinaryAnnotation)); + }); + static PassPipelineRegistration<> registerToAsync( + "gpu-to-async", "Make GPU operations asynchronous", + [&](OpPassManager &pm) { + auto &funcPm = pm.nest(); + funcPm.addPass(createGpuAsyncRegionPass()); + funcPm.addPass(createAsyncRefCountingPass()); + }); + registerPass("gpu-to-cuda-runtime", + "Generate CUDA runtime wrapper calls from GPU operations", + [&]() -> std::unique_ptr { + return createGpuToLLVMConversionPass(gpuBinaryAnnotation); + }); + registerGPUPasses(); + registerAsyncPasses(); + registerConvertAsyncToLLVMPass(); + registerConvertStandardToLLVMPass(); +} + +static LogicalResult runMLIRPasses(ModuleOp module, + PassPipelineCLParser &passPipeline) { + PassManager pm(module.getContext()); applyPassManagerCLOptions(pm); - const char gpuBinaryAnnotation[] = "nvvm.cubin"; - pm.addPass(createGpuKernelOutliningPass()); - auto &kernelPm = pm.nest(); - kernelPm.addPass(createStripDebugInfoPass()); - kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); - kernelPm.addPass(createConvertGPUKernelToBlobPass( - translateModuleToLLVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda", - "sm_35", "+ptx60", gpuBinaryAnnotation)); - auto &funcPm = pm.nest(); - funcPm.addPass(createGpuAsyncRegionPass()); - funcPm.addPass(createAsyncRefCountingPass()); - pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation)); - pm.addPass(createAsyncToAsyncRuntimePass()); - pm.addPass(createConvertAsyncToLLVMPass()); - mlir::LowerToLLVMOptions lower_to_llvm_opts; - pm.addPass(mlir::createLowerToLLVMPass(lower_to_llvm_opts)); - - return pm.run(m); + auto errorHandler = [&](const Twine &msg) { + emitError(UnknownLoc::get(module.getContext())) << msg; + return failure(); + }; + + // Build the provided pipeline. + if (failed(passPipeline.addToPipeline(pm, errorHandler))) + return failure(); + + // Run the pipeline. + return pm.run(module); } int main(int argc, char **argv) { - registerPassManagerCLOptions(); llvm::InitLLVM y(argc, argv); llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); @@ -150,8 +171,16 @@ mlir::initializeLLVMPasses(); + registerCudaRunnerPasses(); + PassPipelineCLParser passPipeline("", "Compiler passes to run"); + registerPassManagerCLOptions(); + + auto mlirTransformer = [&](ModuleOp module) { + return runMLIRPasses(module, passPipeline); + }; + mlir::JitRunnerConfig jitRunnerConfig; - jitRunnerConfig.mlirTransformer = runMLIRPasses; + jitRunnerConfig.mlirTransformer = mlirTransformer; mlir::DialectRegistry registry; registry.insert