diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -70,6 +70,32 @@
 }
 namespace gpu {
+
+/// Options for serializing to CUBIN.
+struct SerializationToCubinOptions {
+  /// LLVM target triple.
+  std::string triple;
+
+  /// SM architecture of the target GPU.
+  std::string chip;
+
+  /// PTX version to produce (e.g. "+ptx60").
+  std::string features;
+
+  /// Optimization level.
+  int optLevel = 2;
+
+  /// Dump generated PTX to stderr for debugging purposes.
+  bool dumpPtx = false;
+
+  /// Compiles the generated PTX with the ptxas compiler. When false, the
+  /// generated PTX is JIT-compiled by the driver.
+  bool usePtxas = true;
+
+  /// Parameters to pass to the ptxas compiler. Ignored when JIT compiling
+  /// with the driver.
+  std::string ptxasParams;
+};
+
 /// Base pass class to serialize kernel functions through LLVM into
 /// user-specified IR and add the resulting blob as module attribute.
 class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
@@ -117,9 +143,18 @@
       *this, "gpu-binary-annotation",
       llvm::cl::desc("Annotation attribute string for GPU binary"),
       llvm::cl::init(getDefaultGpuBinaryAnnotation())};
+  Option<bool> dumpPtx{*this, "dump-ptx",
+                       ::llvm::cl::desc("Dump generated PTX"),
+                       llvm::cl::init(false)};
+
+  Option<bool> usePtxas{
+      *this, "use-ptxas",
+      ::llvm::cl::desc("Compile generated PTX with the ptxas compiler"),
+      llvm::cl::init(true)};
+  Option<std::string> ptxasParams{
+      *this, "ptxas-params",
+      ::llvm::cl::desc("Parameters to pass to the ptxas compiler")};
 };
 
 } // namespace gpu
@@ -137,11 +172,8 @@
 
 /// Create an instance of the GPU kernel function to CUBIN binary serialization
 /// pass with optLevel (default level 2).
-std::unique_ptr<Pass> createGpuSerializeToCubinPass(StringRef triple,
-                                                    StringRef chip,
-                                                    StringRef features,
-                                                    int optLevel = 2,
-                                                    bool dumpPtx = false);
+std::unique_ptr<Pass>
+createGpuSerializeToCubinPass(const gpu::SerializationToCubinOptions &options);
 
 /// Create an instance of the GPU kernel function to HSAco binary serialization
 /// pass.
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
@@ -12,7 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
 #include "mlir/Pass/Pass.h"
@@ -36,6 +43,106 @@
           .concat("]"));
 }
 
+static constexpr char kPtxasCompilerName[] = "ptxas";
+
+/// Compiles the given PTX source with ptxas and returns the resulting cubin,
+/// or failure with an explanation in `message`.
+static FailureOr<std::string>
+compileWithPtxas(StringRef smCapability, StringRef ptxasParams,
+                 StringRef ptxSource, bool dumpPtx, std::string *message) {
+  // Step 0. Find the ptxas compiler in PATH.
+  std::optional<std::string> ptxasCompiler =
+      llvm::sys::Process::FindInEnvPath("PATH", kPtxasCompilerName);
+  if (!ptxasCompiler.has_value())
+    return failure();
+
+  // Step 1. Create temporary files: the PTX source file, log files, and the
+  // cubin file.
+  llvm::SmallString<64> ptxSourceFile, stdinFile, stdoutFile, stderrFile;
+  llvm::sys::fs::createTemporaryFile("mlir-ptx", "", ptxSourceFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stdin", "", stdinFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stdout", "", stdoutFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stderr", "", stderrFile);
+  std::string cubinFile = std::string(ptxSourceFile) + ".cubin";
+  llvm::FileRemover stdinRemover(stdinFile.c_str());
+  llvm::FileRemover stdoutRemover(stdoutFile.c_str());
+  llvm::FileRemover stderrRemover(stderrFile.c_str());
+  llvm::FileRemover binRemover(cubinFile.c_str());
+  llvm::FileRemover srcRemover(ptxSourceFile.c_str());
+
+  // Step 2. Write the generated PTX into a file, so it can be passed to the
+  // ptxas compiler.
+  std::error_code ec;
+  llvm::raw_fd_ostream fPtxSource(ptxSourceFile, ec);
+  fPtxSource << ptxSource;
+  fPtxSource.close();
+  if (fPtxSource.has_error()) {
+    *message = std::string(
+        "Could not write the generated PTX into a temporary file\n");
+    return failure();
+  }
+
+  // Step 3. Build the ptxas command line.
+  std::vector<StringRef> argVector{StringRef("ptxas"), StringRef("-arch"),
+                                   smCapability,       StringRef(ptxSourceFile),
+                                   StringRef("-o"),    StringRef(cubinFile)};
+#ifdef _WIN32
+  auto tokenize = llvm::cl::TokenizeWindowsCommandLine;
+#else
+  auto tokenize = llvm::cl::TokenizeGNUCommandLine;
+#endif // _WIN32
+  llvm::BumpPtrAllocator scratchAllocator;
+  llvm::StringSaver stringSaver(scratchAllocator);
+  SmallVector<const char *> rawArgs;
+  tokenize(ptxasParams, stringSaver, rawArgs, /*MarkEOLs=*/false);
+  for (const auto *rawArg : rawArgs)
+    argVector.emplace_back(rawArg);
+
+  std::optional<StringRef> redirects[] = {
+      stdinFile.str(),
+      stdoutFile.str(),
+      stderrFile.str(),
+  };
+
+  // Step 4. Invoke ptxas.
+  if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(),
+                                llvm::ArrayRef(argVector),
+                                /*Env=*/std::nullopt,
+                                /*Redirects=*/redirects,
+                                /*SecondsToWait=*/0,
+                                /*MemoryLimit=*/0,
+                                /*ErrMsg=*/message)) {
+    if (message->empty()) {
+      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeErrorlog =
+          llvm::MemoryBuffer::getFile(stderrFile);
+      *message = std::string("Invoking ptxas failed, error log: ");
+      if (maybeErrorlog)
+        *message += maybeErrorlog->get()->getBuffer().str();
+    }
+    stderrRemover.releaseFile();
+    return failure();
+  }
+
+  // Step 5. Dump the ptxas output (stderr) if the dumpPtx flag is set. With
+  // `-v` in ptxasParams this shows local memory usage, register usage, etc.
+  if (dumpPtx) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFlog =
+        llvm::MemoryBuffer::getFile(stderrFile);
+    if (maybeFlog)
+      llvm::WithColor::note() << maybeFlog->get()->getBuffer().str();
+  }
+
+  // Step 6. Read the cubin file and return it. It will eventually be embedded
+  // into the executable.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFcubin =
+      llvm::MemoryBuffer::getFile(cubinFile);
+  if (!maybeFcubin) {
+    *message = std::string("Could not read the cubin file\n");
+    return failure();
+  }
+
+  return std::string(maybeFcubin->get()->getBuffer());
+}
+
 #define RETURN_ON_CUDA_ERROR(expr)                                            \
   do {                                                                        \
     if (auto status = (expr)) {                                               \
@@ -54,11 +161,13 @@
   SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda",
                        StringRef chip = "sm_35", StringRef features = "+ptx60",
-                       int optLevel = 2, bool dumpPtx = false);
+                       int optLevel = 2, bool dumpPtx = false,
+                       bool usePtxas = true, StringRef ptxasParams = {});
 
   StringRef getArgument() const override { return "gpu-to-cubin"; }
   StringRef getDescription() const override {
     return "Lower GPU kernel function to CUBIN binary annotations";
   }
 
 private:
@@ -80,9 +189,10 @@
 SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip,
                                            StringRef features, int optLevel,
-                                           bool dumpPtx) {
+                                           bool dumpPtx, bool usePtxas,
+                                           StringRef ptxasParams) {
   // No matter how this pass is constructed, ensure that the NVPTX backend
   // is initialized exactly once.
   llvm::call_once(initializeBackendOnce, []() {
     // Initialize LLVM NVPTX backend.
     LLVMInitializeNVPTXTarget();
@@ -94,7 +204,9 @@
   maybeSetOption(this->triple, triple);
   maybeSetOption(this->chip, chip);
   maybeSetOption(this->features, features);
+  maybeSetOption(this->ptxasParams, ptxasParams);
   this->dumpPtx = dumpPtx;
+  this->usePtxas = usePtxas;
   if (this->optLevel.getNumOccurrences() == 0)
     this->optLevel.setValue(optLevel);
 }
@@ -131,9 +244,24 @@
   auto kernelName = getOperation().getName().str();
   if (dumpPtx) {
-    llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n";
-    llvm::dbgs() << isa << "\n";
+    llvm::errs() << "// Kernel Name : [" << kernelName << "]\n";
+    llvm::errs() << isa << "\n";
   }
+
+  if (usePtxas) {
+    // Try to compile the PTX with ptxas.
+    std::string message;
+    FailureOr<std::string> maybeCubinImage =
+        compileWithPtxas(this->chip, ptxasParams, isa, dumpPtx, &message);
+    if (succeeded(maybeCubinImage)) {
+      return std::make_unique<std::vector<char>>(
+          maybeCubinImage.value().begin(), maybeCubinImage.value().end());
+    }
+    emitError(loc) << message;
+    return {};
+  }
+
+  // Otherwise, JIT compile the PTX with the CUDA driver.
   RETURN_ON_CUDA_ERROR(cuLinkAddData(
       linkState, CUjitInputType::CU_JIT_INPUT_PTX,
       const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
@@ -159,17 +287,22 @@
 // Register pass to serialize GPU kernel functions to a CUBIN binary
 // annotation.
 void mlir::registerGpuSerializeToCubinPass() {
-  PassRegistration<SerializeToCubinPass> registerSerializeToCubin(
-      [] { return std::make_unique<SerializeToCubinPass>(); });
+  PassRegistration<SerializeToCubinPass> registerSerializeToCubin([] {
+    // Initialize LLVM NVPTX backend.
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXAsmPrinter();
+
+    return std::make_unique<SerializeToCubinPass>();
+  });
 }
 
-std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(StringRef triple,
-                                                          StringRef arch,
-                                                          StringRef features,
-                                                          int optLevel,
-                                                          bool dumpPtx) {
-  return std::make_unique<SerializeToCubinPass>(triple, arch, features,
-                                                optLevel, dumpPtx);
+std::unique_ptr<Pass> mlir::createGpuSerializeToCubinPass(
+    const gpu::SerializationToCubinOptions &options) {
+  return std::make_unique<SerializeToCubinPass>(
+      options.triple, options.chip, options.features, options.optLevel,
+      options.dumpPtx, options.usePtxas, options.ptxasParams);
 }
 
 #else // MLIR_GPU_TO_CUBIN_PASS_ENABLE
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -102,8 +102,12 @@
   // Finalize GPU code generation.
   if (gpuCodegen) {
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-    pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-        options.gpuTriple, options.gpuChip, options.gpuFeatures));
+    gpu::SerializationToCubinOptions cubinOptions;
+    cubinOptions.triple = options.gpuTriple;
+    cubinOptions.chip = options.gpuChip;
+    cubinOptions.features = options.gpuFeatures;
+    pm.addNestedPass<gpu::GPUModuleOp>(
+        createGpuSerializeToCubinPass(cubinOptions));
 #endif
     pm.addPass(createGpuToLLVMConversionPass());
   }
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -172,8 +172,12 @@
   pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-  pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-      options.cubinTriple, options.cubinChip, options.cubinFeatures));
+  gpu::SerializationToCubinOptions cubinOptions;
+  cubinOptions.triple = options.cubinTriple;
+  cubinOptions.chip = options.cubinChip;
+  cubinOptions.features = options.cubinFeatures;
+  pm.addNestedPass<gpu::GPUModuleOp>(
+      createGpuSerializeToCubinPass(cubinOptions));
 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }
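
Usage note: after this change, clients populate a gpu::SerializationToCubinOptions
struct rather than threading individual arguments through
createGpuSerializeToCubinPass. A minimal sketch, assuming an existing
OpPassManager `pm`; the triple, chip, and features values are illustrative:

  // Serialize GPU modules to CUBIN through ptxas. "-v" is forwarded verbatim
  // to ptxas so that, together with dumpPtx, its resource-usage report
  // (registers, local memory) is printed to stderr.
  gpu::SerializationToCubinOptions cubinOptions;
  cubinOptions.triple = "nvptx64-nvidia-cuda";
  cubinOptions.chip = "sm_80";
  cubinOptions.features = "+ptx76";
  cubinOptions.usePtxas = true;    // the default; false selects the driver JIT
  cubinOptions.ptxasParams = "-v";
  cubinOptions.dumpPtx = true;
  pm.addNestedPass<gpu::GPUModuleOp>(
      createGpuSerializeToCubinPass(cubinOptions));

With these options, compileWithPtxas builds a command line equivalent to
`ptxas -arch sm_80 <tmp> -o <tmp>.cubin -v`, where <tmp> is the temporary PTX
file. From mlir-opt, the flags added in this patch are exposed as pass options,
e.g. --gpu-to-cubin="use-ptxas=true ptxas-params=-v dump-ptx=true".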