diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h --- a/mlir/include/mlir/Dialect/GPU/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Passes.h @@ -53,15 +53,18 @@ void runOnOperation() final; +protected: + void getDependentDialects(DialectRegistry &registry) const override; + private: - // Creates the LLVM target machine to generate the ISA. + /// Creates the LLVM target machine to generate the ISA. std::unique_ptr<llvm::TargetMachine> createTargetMachine(); - // Translates the 'getOperation()' result to an LLVM module. + /// Translates the 'getOperation()' result to an LLVM module. virtual std::unique_ptr<llvm::Module> - translateToLLVMIR(llvm::LLVMContext &llvmContext) = 0; + translateToLLVMIR(llvm::LLVMContext &llvmContext); - // Serializes the target ISA to binary form. + /// Serializes the target ISA to binary form. virtual std::unique_ptr<std::vector<char>> serializeISA(const std::string &isa) = 0; @@ -83,6 +86,10 @@ // Registration //===----------------------------------------------------------------------===// +/// Register pass to serialize GPU kernel functions to a CUBIN binary +/// annotation. +void registerGpuSerializeToCubinPass(); + /// Generate the code for registering passes. 
#define GEN_PASS_REGISTRATION #include "mlir/Dialect/GPU/Passes.h.inc" diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -51,6 +51,7 @@ registerAffinePasses(); registerAsyncPasses(); registerGPUPasses(); + registerGpuSerializeToCubinPass(); registerLinalgPasses(); LLVM::registerLLVMPasses(); quant::registerQuantPasses(); diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -24,6 +24,8 @@ intrinsics_gen LINK_COMPONENTS + Core + MC ${AMDGPU_LIBS} ${NVPTX_LIBS} diff --git a/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp b/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp --- a/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp +++ b/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp @@ -61,6 +61,8 @@ private: // Translates the 'getOperation()' result to an LLVM module. + // Note: when this class is removed, this function no longer needs to be + // virtual. 
std::unique_ptr<llvm::Module> translateToLLVMIR(llvm::LLVMContext &llvmContext) override { return loweringCallback(getOperation(), llvmContext, "LLVMDialectModule"); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -1,3 +1,11 @@ +if (MLIR_CUDA_CONVERSIONS_ENABLED) + set(NVPTX_LIBS + NVPTXCodeGen + NVPTXDesc + NVPTXInfo + ) +endif() + add_mlir_dialect_library(MLIRGPU IR/GPUDialect.cpp Transforms/AllReduceLowering.cpp @@ -6,6 +14,7 @@ Transforms/MemoryPromotion.cpp Transforms/ParallelLoopMapper.cpp Transforms/SerializeToBlob.cpp + Transforms/SerializeToCubin.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU @@ -13,6 +22,7 @@ LINK_COMPONENTS Core MC + ${NVPTX_LIBS} DEPENDS MLIRGPUOpsIncGen @@ -26,6 +36,7 @@ MLIREDSC MLIRIR MLIRLLVMIR + MLIRLLVMToLLVMIRTranslation MLIRSCF MLIRPass MLIRSideEffectInterfaces @@ -33,3 +44,42 @@ MLIRSupport MLIRTransformUtils ) + +if(MLIR_CUDA_RUNNER_ENABLED) + if(NOT MLIR_CUDA_CONVERSIONS_ENABLED) + message(SEND_ERROR + "Building mlir with cuda support requires the NVPTX backend") + endif() + + # Configure CUDA language support. Using check_language first allows us to + # give a custom error message. + include(CheckLanguage) + check_language(CUDA) + if (CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(SEND_ERROR + "Building mlir with cuda support requires a working CUDA install") + endif() + + # Enable gpu-to-cubin pass. + target_compile_definitions(obj.MLIRGPU + PRIVATE + MLIR_GPU_TO_CUBIN_PASS_ENABLE=1 + ) + + # Add CUDA headers includes and the libcuda.so library. 
+ target_include_directories(obj.MLIRGPU + PRIVATE + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ) + + find_library(CUDA_DRIVER_LIBRARY cuda) + + target_link_libraries(MLIRGPU + PRIVATE + MLIRNVVMToLLVMIRTranslation + ${CUDA_DRIVER_LIBRARY} + ) + +endif() diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp --- a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp @@ -14,6 +14,8 @@ #include "mlir/Dialect/GPU/Passes.h" #include "mlir/Pass/Pass.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" @@ -68,6 +70,12 @@ getOperation()->setAttr(gpuBinaryAnnotation, attr); } +void gpu::SerializeToBlobPass::getDependentDialects( + DialectRegistry &registry) const { + registerLLVMDialectTranslation(registry); + OperationPass<gpu::GPUModuleOp>::getDependentDialects(registry); +} + std::unique_ptr<llvm::TargetMachine> gpu::SerializeToBlobPass::createTargetMachine() { Location loc = getOperation().getLoc(); @@ -87,3 +95,9 @@ return std::unique_ptr<llvm::TargetMachine>{machine}; } + +std::unique_ptr<llvm::Module> +gpu::SerializeToBlobPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) { + return translateModuleToLLVMIR(getOperation(), llvmContext, + "LLVMDialectModule"); +} diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp @@ -0,0 +1,142 @@ +//===- LowerGPUToCUBIN.cpp - Convert GPU kernel to CUBIN blob -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass that serializes a gpu module into CUBIN blob and +// adds that blob as a string attribute of the module. +// +//===----------------------------------------------------------------------===// +#include "mlir/Dialect/GPU/Passes.h" + +#if MLIR_GPU_TO_CUBIN_PASS_ENABLE +#include "mlir/Pass/Pass.h" +#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "llvm/Support/TargetSelect.h" + +#include <cuda.h> + +using namespace mlir; + +static void emitCudaError(const llvm::Twine &expr, const char *buffer, + CUresult result, Location loc) { + const char *error; + cuGetErrorString(result, &error); + emitError(loc, expr.concat(" failed with error code ") + .concat(llvm::Twine{error}) + .concat("[") + .concat(buffer) + .concat("]")); +} + +#define RETURN_ON_CUDA_ERROR(expr) \ + do { \ + if (auto status = (expr)) { \ + emitCudaError(#expr, jitErrorBuffer, status, loc); \ + return {}; \ + } \ + } while (false) + +namespace { +class SerializeToCubinPass + : public PassWrapper<SerializeToCubinPass, gpu::SerializeToBlobPass> { +public: + SerializeToCubinPass(); + +private: + void getDependentDialects(DialectRegistry &registry) const override; + + // Serializes PTX to CUBIN. + std::unique_ptr<std::vector<char>> + serializeISA(const std::string &isa) override; +}; +} // namespace + +// Sets the 'option' to 'value' unless it already has a value. 
+static void maybeSetOption(Pass::Option<std::string> &option, + const char *value) { + if (!option.hasValue()) + option = value; +} + +SerializeToCubinPass::SerializeToCubinPass() { + maybeSetOption(this->triple, "nvptx64-nvidia-cuda"); + maybeSetOption(this->chip, "sm_35"); + maybeSetOption(this->features, "+ptx60"); +} + +void SerializeToCubinPass::getDependentDialects( + DialectRegistry &registry) const { + registerNVVMDialectTranslation(registry); + gpu::SerializeToBlobPass::getDependentDialects(registry); +} + +std::unique_ptr<std::vector<char>> +SerializeToCubinPass::serializeISA(const std::string &isa) { + Location loc = getOperation().getLoc(); + char jitErrorBuffer[4096] = {0}; + + RETURN_ON_CUDA_ERROR(cuInit(0)); + + // Linking requires a device context. + CUdevice device; + RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0)); + CUcontext context; + RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device)); + CUlinkState linkState; + + CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}; + void *jitOptionsVals[] = {jitErrorBuffer, + reinterpret_cast<void *>(sizeof(jitErrorBuffer))}; + + RETURN_ON_CUDA_ERROR(cuLinkCreate(2, /* number of jit options */ + jitOptions, /* jit options */ + jitOptionsVals, /* jit option values */ + &linkState)); + + auto kernelName = getOperation().getName().str(); + RETURN_ON_CUDA_ERROR(cuLinkAddData( + linkState, CUjitInputType::CU_JIT_INPUT_PTX, + const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(), + kernelName.c_str(), 0, /* number of jit options */ + nullptr, /* jit options */ + nullptr /* jit option values */ + )); + + void *cubinData; + size_t cubinSize; + RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize)); + + char *cubinAsChar = static_cast<char *>(cubinData); + auto result = + std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize); + + // This will also destroy the cubin data. 
+ RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState)); + RETURN_ON_CUDA_ERROR(cuCtxDestroy(context)); + + return result; +} + +// Register pass to serialize GPU kernel functions to a CUBIN binary annotation. +void mlir::registerGpuSerializeToCubinPass() { + PassRegistration<SerializeToCubinPass> registerSerializeToCubin( + "gpu-to-cubin", "Lower GPU kernel function to CUBIN binary annotations", + [] { + // Initialize LLVM NVPTX backend. + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + + return std::make_unique<SerializeToCubinPass>(); + }); +} +#else // MLIR_GPU_TO_CUBIN_PASS_ENABLE +void mlir::registerGpuSerializeToCubinPass() {} +#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir --- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir +++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir @@ -1,6 +1,8 @@ -// RUN: mlir-cuda-runner %s \ -// RUN: -gpu-to-cubin="gpu-binary-annotation=nvvm.cubin" \ +// RUN: mlir-opt %s \ +// RUN: -gpu-kernel-outlining \ +// RUN: -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{gpu-binary-annotation=nvvm.cubin})' \ // RUN: -gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" \ +// RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \