diff --git a/mlir/include/mlir/Conversion/GPUToCUBIN/GPUToCUBINPass.h b/mlir/include/mlir/Conversion/GPUToCUBIN/GPUToCUBINPass.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUToCUBIN/GPUToCUBINPass.h
@@ -0,0 +1,20 @@
+//===- GPUToCUBINPass.h - Convert GPU kernel to CUBIN blob ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUTOCUBIN_GPUTOCUBINPASS_H_
+#define MLIR_CONVERSION_GPUTOCUBIN_GPUTOCUBINPASS_H_
+
+namespace mlir {
+namespace gpu {
+
+// Register pass to lower a GPU kernel function to a CUBIN binary annotation.
+void registerGpuKernelToCubinPass();
+
+} // namespace gpu
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOCUBIN_GPUTOCUBINPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -13,6 +13,7 @@
 #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
 #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToCUBIN/GPUToCUBINPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
 #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h"
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -46,6 +46,7 @@
 
   // Conversion passes
   registerConversionPasses();
+  gpu::registerGpuKernelToCubinPass();
 
   // Dialect passes
   registerAffinePasses();
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_subdirectory(AVX512ToLLVM)
 add_subdirectory(ComplexToLLVM)
 add_subdirectory(GPUCommon)
+add_subdirectory(GPUToCUBIN)
 add_subdirectory(GPUToNVVM)
 add_subdirectory(GPUToROCDL)
 add_subdirectory(GPUToSPIRV)
diff --git a/mlir/lib/Conversion/GPUToCUBIN/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUBIN/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToCUBIN/CMakeLists.txt
@@ -0,0 +1,16 @@
+if (MLIR_CUDA_CONVERSIONS_ENABLED)
+  add_mlir_conversion_library(MLIRGPUToCUBIN
+    LowerGPUToCUBIN.cpp
+
+    LINK_COMPONENTS
+    NVPTXCodeGen
+    NVPTXDesc
+    NVPTXInfo
+
+    LINK_LIBS PUBLIC
+    MLIRGPUToGPURuntimeTransforms
+    MLIRGPUToNVVMTransforms
+    MLIRLLVMToLLVMIRTranslation
+    MLIRNVVMToLLVMIRTranslation
+    )
+endif()
diff --git a/mlir/lib/Conversion/GPUToCUBIN/LowerGPUToCUBIN.cpp b/mlir/lib/Conversion/GPUToCUBIN/LowerGPUToCUBIN.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToCUBIN/LowerGPUToCUBIN.cpp
@@ -0,0 +1,142 @@
+//===- LowerGPUToCUBIN.cpp - Convert GPU kernel to CUBIN blob -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to generate CUBIN blob from a GPU kernel
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+// Included unconditionally so that the stub in the #else branch below is
+// defined against a visible declaration.
+#include "mlir/Conversion/GPUToCUBIN/GPUToCUBINPass.h"
+
+#if MLIR_CUDA_CONVERSIONS_ENABLED
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Export.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Support/TargetSelect.h"
+
+#include <cuda.h>
+
+using namespace mlir;
+
+// Emits an error at `loc` describing a failed CUDA driver call. `expr` is the
+// text of the failing call and `buffer` is the JIT error log.
+static void emitCudaError(const llvm::Twine &expr, const char *buffer,
+                          CUresult result, Location loc) {
+  // cuGetErrorString sets the string to null for unrecognized error codes.
+  const char *error = nullptr;
+  if (cuGetErrorString(result, &error) != CUDA_SUCCESS || !error)
+    error = "<unknown>";
+  emitError(loc, expr.concat(" failed with error code ")
+                     .concat(llvm::Twine{error})
+                     .concat("[")
+                     .concat(buffer)
+                     .concat("]"));
+}
+
+// Reports a failed CUDA driver call and returns a null blob from the enclosing
+// function. Expects `jitErrorBuffer` and `loc` to be in scope.
+#define RETURN_ON_CUDA_ERROR(expr)                                             \
+  do {                                                                         \
+    if (auto status = (expr)) {                                                \
+      emitCudaError(#expr, jitErrorBuffer, status, loc);                       \
+      return {};                                                               \
+    }                                                                          \
+  } while (false)
+
+// Links the `ptx` of the kernel called `name` into a CUBIN blob using the CUDA
+// driver API. Emits an error at `loc` and returns null on failure.
+static OwnedBlob compilePtxToCubin(const std::string &ptx, Location loc,
+                                   StringRef name) {
+  char jitErrorBuffer[4096] = {0};
+
+  RETURN_ON_CUDA_ERROR(cuInit(0));
+
+  // Linking requires a device context.
+  CUdevice device;
+  RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
+  CUcontext context;
+  RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device));
+  // Do not leak the context when bailing out on a later error.
+  auto contextGuard = llvm::make_scope_exit([&] { cuCtxDestroy(context); });
+
+  CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
+                               CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
+  void *jitOptionsVals[] = {jitErrorBuffer,
+                            reinterpret_cast<void *>(sizeof(jitErrorBuffer))};
+
+  CUlinkState linkState;
+  RETURN_ON_CUDA_ERROR(cuLinkCreate(2,              /* number of jit options */
+                                    jitOptions,     /* jit options */
+                                    jitOptionsVals, /* jit option values */
+                                    &linkState));
+  // Do not leak the link state when bailing out on a later error.
+  auto linkGuard = llvm::make_scope_exit([&] { cuLinkDestroy(linkState); });
+
+  RETURN_ON_CUDA_ERROR(
+      cuLinkAddData(linkState, CUjitInputType::CU_JIT_INPUT_PTX,
+                    const_cast<void *>(static_cast<const void *>(ptx.c_str())),
+                    ptx.length(), name.str().data(), /* kernel name */
+                    0,                               /* number of jit options */
+                    nullptr,                         /* jit options */
+                    nullptr                          /* jit option values */
+                    ));
+
+  void *cubinData;
+  size_t cubinSize;
+  RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize));
+
+  char *cubinAsChar = static_cast<char *>(cubinData);
+  OwnedBlob result =
+      std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
+
+  // On success, tear down explicitly so that failures are still reported.
+  // Destroying the link state also destroys the cubin data.
+  linkGuard.release();
+  RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
+  contextGuard.release();
+  RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
+
+  return result;
+}
+
+// Translates `m` to LLVM IR after registering the LLVM and NVVM dialect
+// translations with its context.
+static std::unique_ptr<llvm::Module>
+translateModuleToNVVMIR(Operation *m, llvm::LLVMContext &llvmContext,
+                        StringRef moduleName) {
+  registerLLVMDialectTranslation(*m->getContext());
+  registerNVVMDialectTranslation(*m->getContext());
+  return translateModuleToLLVMIR(m, llvmContext, moduleName);
+}
+
+// Register pass to lower a GPU kernel function to a CUBIN binary annotation.
+void mlir::gpu::registerGpuKernelToCubinPass() {
+  // NOTE(review): template argument reconstructed from the pass factory's
+  // return type; confirm against the original patch.
+  PassRegistration<OperationPass<gpu::GPUModuleOp>> registerGpuToCubin(
+      "gpu-to-cubin", "Lower GPU kernel function to CUBIN binary annotations",
+      [] {
+        // Initialize LLVM NVPTX backend.
+        LLVMInitializeNVPTXTarget();
+        LLVMInitializeNVPTXTargetInfo();
+        LLVMInitializeNVPTXTargetMC();
+        LLVMInitializeNVPTXAsmPrinter();
+
+        return createConvertGPUKernelToBlobPass(
+            translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
+            "sm_35", "+ptx60", "nvvm.cubin");
+      });
+}
+#else  // MLIR_CUDA_CONVERSIONS_ENABLED
+void mlir::gpu::registerGpuKernelToCubinPass() {}
+#endif // MLIR_CUDA_CONVERSIONS_ENABLED
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,6 +1,8 @@
-// RUN: mlir-cuda-runner %s \
-// RUN:   -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \
+// RUN: mlir-opt %s --print-ir-after-all \
+// RUN:   -gpu-kernel-outlining \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin)' \
 // RUN:   -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \
+// RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void \