diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h --- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h +++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h @@ -9,6 +9,7 @@ #define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ #include "mlir/Dialect/GPU/Transforms/Utils.h" +#include "mlir/IR/Attributes.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/StringRef.h" #include @@ -51,9 +52,7 @@ /// populate converter for gpu types. void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, - StringRef gpuBinaryAnnotation = {}, bool kernelBarePtrCallConv = false); - } // namespace mlir #endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -358,21 +358,33 @@ This pass does not generate code to call GPU runtime APIs directly but instead uses a small wrapper library that exports a stable and conveniently typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP). + + Target option examples: + 1. Set the target of the module to AMDGPU, chip `gfx90a` and using fast + math. + ``` + --gpu-to-llvm='target="AMDGPU: chip="gfx90a", opts = {fast}"' + ``` + 2. Select the target `mytarget` from the existing attributes of the module. + ``` + --gpu-to-llvm='target=mytarget' + ``` }]; let options = [ Option<"kernelBarePtrCallConv", "use-bare-pointers-for-kernels", "bool", - /*default=*/"false", - "Use bare pointers to pass memref arguments to kernels. " - "The kernel must use the same setting for this option." - >, - Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string", - /*default=*/"gpu::getDefaultGpuBinaryAnnotation()", - "Annotation attribute string for GPU binary" - >, + /*default=*/"false", + "Use bare pointers to pass memref arguments to kernels. 
" + "The kernel must use the same setting for this option." + >, + Option<"gpuTarget", "target", "std::string", + /*default=*/"\"\"", + "Selects or sets a GPU target in the module for translation." + "For setting a target, this option must start and end with `\\\"`." + >, Option<"useOpaquePointers", "use-opaque-pointers", "bool", - /*default=*/"true", "Generate LLVM IR using opaque pointers " - "instead of typed pointers">, + /*default=*/"true", "Generate LLVM IR using opaque pointers " + "instead of typed pointers">, ]; let dependentDialects = [ diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -16,6 +16,7 @@ include "mlir/Dialect/DLTI/DLTIBase.td" include "mlir/Dialect/GPU/IR/GPUBase.td" include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td" +include "mlir/Dialect/GPU/IR/TranslationTargetAttr.td" include "mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td" include "mlir/IR/EnumAttr.td" include "mlir/IR/FunctionInterfaces.td" diff --git a/mlir/include/mlir/Dialect/GPU/IR/TranslationTargetAttr.td b/mlir/include/mlir/Dialect/GPU/IR/TranslationTargetAttr.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/IR/TranslationTargetAttr.td @@ -0,0 +1,95 @@ +//===-- TranslationTargetAttr.td - GPU translation target attribute -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the translation target used to configure the serialization pipeline +// during translation of the GPU dialect. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GPU_TRANSLATIONTARGET
+#define GPU_TRANSLATIONTARGET
+
+include "mlir/Dialect/GPU/IR/GPUBase.td"
+
+def GPU_TranslationPipeline : I32EnumAttr<"TranslationPipeline",
+  "Pipeline to be used during GPU translation.",
+  [
+    I32EnumAttrCase<"NVPTX", 0>,
+    I32EnumAttrCase<"AMDGPU", 1>,
+  ]>{
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::gpu";
+}
+
+def GPU_TranslationPipelineAttr : EnumAttr<GPU_Dialect, GPU_TranslationPipeline, "pipeline">;
+
+def GPU_TranslationTargetAttr: GPU_Attr<"TranslationTarget", "target"> {
+  let description = [{
+    Target options used during translation. The only required parameter is the
+    translation pipeline, all other parameters decay into default values if not
+    present. The default values will depend on the chosen target.
+
+    Examples:
+
+    1. NVPTX target with default values.
+    ```
+    gpu.module attributes { target = #gpu.target<NVPTX> } {
+    ...
+    }
+    ```
+    2. AMDGPU target, using target chip `gfx90a`, fast math options and no wave64.
+    ```
+    gpu.module attributes {
+      target = #gpu.target<AMDGPU: chip = "gfx90a", opts = {fast, no_wave64}>
+    } {
+    ...
+    }
+    ```
+  }];
+  let parameters = (ins
+    AttrOrTypeParameter<"TranslationPipeline", "Pipeline used during translation.">:$pipeline,
+    DefaultValuedParameter<"int", "0", "Optimization level to apply.">:$O,
+    OptionalParameter<"StringAttr", "Target triple.">:$triple,
+    OptionalParameter<"StringAttr", "Target chip.">:$chip,
+    OptionalParameter<"StringAttr", "Chip features.">:$features,
+    OptionalParameter<"StringAttr", "Path to toolkit with device libraries.">:$toolkit,
+    OptionalParameter<"ArrayAttr", "Files to link with the device module.">:$link,
+    OptionalParameter<"Attribute", "Target specific options.">:$opts
+  );
+  let assemblyFormat = [{
+    `<`$pipeline (`:` struct($O, $triple, $chip, $features, $toolkit, $link, $opts)^)?
`>`
+  }];
+  let builders = [
+    AttrBuilder<(ins "TranslationPipeline":$pipeline,
+                     CArg<"int", "0">:$optLevel,
+                     CArg<"StringRef", "{}">:$triple,
+                     CArg<"StringRef", "{}">:$chip,
+                     CArg<"StringRef", "{}">:$features,
+                     CArg<"StringRef", "{}">:$toolkitPath,
+                     CArg<"ArrayAttr", "{}">:$filesToLink,
+                     CArg<"Attribute", "{}">:$pipelineOptions), [{
+      auto getStrAttr = [&$_ctxt](StringRef str) {
+        return str.empty() ? StringAttr() : StringAttr::get($_ctxt, str);
+      };
+      return Base::get($_ctxt,
+                       pipeline,
+                       optLevel,
+                       getStrAttr(triple),
+                       getStrAttr(chip),
+                       getStrAttr(features),
+                       getStrAttr(toolkitPath),
+                       filesToLink,
+                       pipelineOptions);
+    }]>
+  ];
+  let genVerifyDecl = 1;
+}
+
+#endif // GPU_TRANSLATIONTARGET
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -70,6 +70,14 @@
 }
 
 namespace gpu {
+/// Sets the target attribute used in translation. If target is `null` it
+/// selects the module attribute with key `targetAttrName`, otherwise it sets
+/// the attribute to `target`. Returns failure if there's no attribute with key
+/// `targetAttrName`.
+LogicalResult selectOrSetTargetAttr(GPUModuleOp module,
+                                    StringRef targetAttrName,
+                                    TranslationTargetAttr target = {});
+
 /// Base pass class to serialize kernel functions through LLVM into
 /// user-specified IR and add the resulting blob as module attribute.
 class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h b/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h
@@ -25,9 +25,22 @@
 namespace gpu {
 class GPUFuncOp;
 class LaunchOp;
+class GPUModuleOp;
+class TranslationTargetAttr;
 
 /// Returns the default annotation name for GPU binary blobs.
 std::string getDefaultGpuBinaryAnnotation();
+
+/// Returns the name of the target attribute.
+StringRef getTargetAttrName();
+
+/// Returns the name of the global variable to be used for storing the binary
+/// annotation stub during the `--gpu-to-llvm` pass.
+SmallString<128> getBinaryStorageStubName(StringRef moduleName);
+
+/// Returns the name of the global variable to be used for storing the binary
+/// annotation during translation.
+SmallString<128> getBinaryStorageName(StringRef moduleName);
 } // namespace gpu
 
 /// Get a gpu.func created from outlining the region of a gpu.launch op with the
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.h b/mlir/include/mlir/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.h
@@ -0,0 +1,144 @@
+//===- GPUTranslationTargets.h - GPU Dialect translation targets --------*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides interfaces for interacting with `TranslationTargetAttr`.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_GPU_GPUTRANSLATIONTARGETS_H
+#define MLIR_TARGET_LLVMIR_DIALECT_GPU_GPUTRANSLATIONTARGETS_H
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Utils.h"
+
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+namespace gpu {
+/// Interface for all TranslationTarget* classes.
+class TranslationTarget {
+public:
+  /// Get the target triple, if null, return default triple.
+  StringRef getTargetTriple();
+
+  /// Get the target chip, if null, return default chip.
+ StringRef getTargetChip(); + + /// Get the target features, if null, return default features. + StringRef getTargetFeatures(); + + /// Get the toolkit path, if null, return the path inferred by CMake. + StringRef getToolkitPath(); + + /// Get the bitcode files for linking specified in the attribute. + SmallVector getFilesToLink(); + + /// Get the optimization level. + int getOptLevel(); + + /// Get the target specific options. + Attribute getTargetOptions(); + + /// If `opts` is a `DictionaryAttr`, try to return the attribute with name + /// `optionName`. + Attribute getTargetOption(StringRef optionName); + + /// If `opts` is a `DictionaryAttr`, try to return the attribute with name + /// `optionName` casted to `Ty`. + template + Ty getTargetOption(StringRef optionName) { + return dyn_cast_or_null(getTargetOption(optionName)); + } + + /// Checks if the fast math option was passed. + bool getFastMath(); + + /// Checks if the `ftz` option was passed. + bool getFtz(); + +protected: + TranslationTarget(TranslationTargetAttr target, StringRef defaultTriple, + StringRef defaultChip, StringRef defaultFeatures, + StringRef defaultToolkitPath); + + /// Translation target attribute. + TranslationTargetAttr target; + +private: + /// Default target triple. + const StringRef defaultTriple; + + /// Default target chip. + const StringRef defaultChip; + + /// Default target features. + const StringRef defaultFeatures; + + /// Default toolkit path. + const StringRef defaultToolkitPath; +}; + +/// Class for interacting with a translation target attribute for NVPTX targets, +/// e.g. `#gpu.target`. +class NVPTXTranslationTarget : public TranslationTarget { +public: + NVPTXTranslationTarget(TranslationTargetAttr target); + + /// Default target triple. + static constexpr llvm::StringLiteral kDefaultTriple = "nvptx64-nvidia-cuda"; + + /// Default target chip. + static constexpr llvm::StringLiteral kDefaultChip = "sm_35"; + + /// Default target features. 
+  static constexpr llvm::StringLiteral kDefaultFeatures = "+ptx60";
+
+  /// Get the toolkit path inferred by CMake, or `""` if none was inferred.
+  static StringRef getDefaultToolkitPath();
+};
+
+/// Class for interacting with a translation target attribute for AMDGPU
+/// targets, e.g. `#gpu.target<AMDGPU>`.
+class AMDGPUTranslationTarget : public TranslationTarget {
+public:
+  AMDGPUTranslationTarget(TranslationTargetAttr target);
+
+  /// Get whether to use the wave64 mode - it's enabled by default.
+  bool getWave64();
+
+  /// Checks if the `finite_only` option was passed.
+  bool getFiniteOnly();
+
+  /// Checks if the `unsafe_math` option was passed.
+  bool getUnsafeMath();
+
+  /// Checks if the `correct_sqrt` option was passed.
+  bool getCorrectSqrt();
+
+  /// Returns the `abi_ver` option.
+  StringRef getAbiVer();
+
+  /// Default target triple.
+  static constexpr llvm::StringLiteral kDefaultTriple = "amdgcn-amd-amdhsa";
+
+  /// Default target chip.
+  static constexpr llvm::StringLiteral kDefaultChip = "gfx600";
+
+  /// Default target features.
+  static constexpr llvm::StringLiteral kDefaultFeatures = "";
+
+  /// Default ABI version.
+  static constexpr llvm::StringLiteral kDefaultAbiVer = "400";
+
+  /// Get the toolkit path inferred by CMake, or `""` if none was inferred.
+ static StringRef getDefaultToolkitPath(); +}; +} // namespace gpu +} // namespace mlir + +#endif diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -15,6 +15,7 @@ #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/AsmParser/AsmParser.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" @@ -45,8 +46,6 @@ using namespace mlir; -static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst"; - namespace { class GpuToLLVMConversionPass @@ -384,10 +383,8 @@ : public ConvertOpToGpuRuntimeCallPattern { public: ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter, - StringRef gpuBinaryAnnotation, bool kernelBarePtrCallConv) : ConvertOpToGpuRuntimeCallPattern(typeConverter), - gpuBinaryAnnotation(gpuBinaryAnnotation), kernelBarePtrCallConv(kernelBarePtrCallConv) {} private: @@ -400,22 +397,9 @@ matchAndRewrite(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override; - llvm::SmallString<32> gpuBinaryAnnotation; bool kernelBarePtrCallConv; }; -class EraseGpuModuleOpPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(gpu::GPUModuleOp op, - PatternRewriter &rewriter) const override { - // GPU kernel modules are no longer necessary since we have a global - // constant with the CUBIN, or HSACO data. - rewriter.eraseOp(op); - return success(); - } -}; - /// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime /// call. Currently it supports CUDA and ROCm (HIP). 
class ConvertMemcpyOpToGpuRuntimeCallPattern @@ -649,9 +633,77 @@ ConversionPatternRewriter &rewriter) const override; }; +// Parses a TranslationTargetAttr from a string. If parsing failed it returns +// failure on the first attribute. If the `attrStr` doesn't starts and ends with +// `"`, returns success and a nullptr. +std::pair +parseTargetAttr(StringRef attrStr, MLIRContext *context) { + // This method assumes that the body of the target attribute is surrounded by + // `"`. + if (attrStr.starts_with("\"") && attrStr.ends_with("\"")) { + attrStr = attrStr.ltrim("\"").rtrim("\""); + + // Empty attributes are not valid. + if (attrStr.empty()) + return {failure(), nullptr}; + + // Parse the attribute. + std::string attrTmp = "#gpu.target<" + attrStr.str() + ">"; + auto attr = dyn_cast_or_null( + parseAttribute(attrTmp, context)); + return {attr ? success() : failure(), attr}; + } + // If the method doesn't starts and ends with `"`, the method deduces the + // string is not a TranslationTarget. + return {success(), nullptr}; +} } // namespace +LogicalResult mlir::gpu::selectOrSetTargetAttr(GPUModuleOp module, + StringRef targetAttrName, + TranslationTargetAttr target) { + // If `target` is valid, set the attribute. + if (target) { + module->setAttr(getTargetAttrName(), target); + } else { + // Try selecting the attribute from the existing module attributes. + StringRef attrName = getTargetAttrName(); + if (attrName != targetAttrName) { + Attribute attr = module->removeAttr(targetAttrName); + if (!attr) { + module.emitError() << "`" << targetAttrName + << "` is not a valid attribute key."; + return failure(); + } + module->setAttr(attrName, attr); + } + } + return success(); +} + void GpuToLLVMConversionPass::runOnOperation() { + // If `gpuTarget` is not empty update all modules. 
+ if (gpuTarget.size()) { + // Try to parse `gpuTarget` as an attribute, if the attribute is null and + // parsing succeeded it means the pattern will perform target selection + // instead of setting the attribute. + auto [parseStatus, targetAttr] = parseTargetAttr(gpuTarget, &getContext()); + if (failed(parseStatus)) { + getOperation().emitError() + << gpuTarget << " is not a valid target attribute."; + return signalPassFailure(); + } + std::string targetAttrName = gpuTarget; + if (targetAttr) + targetAttrName = gpu::getTargetAttrName(); + + // Update the target attribute in all nested `gpu.module`s. + for (auto op : getOperation().getBody()->getOps()) + if (failed(selectOrSetTargetAttr(op, targetAttrName, targetAttr))) + return signalPassFailure(); + } + + // Populate the conversion pass. LowerToLLVMOptions options(&getContext()); options.useOpaquePointers = useOpaquePointers; @@ -659,8 +711,6 @@ RewritePatternSet patterns(&getContext()); LLVMConversionTarget target(getContext()); - target.addIllegalDialect(); - mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns); mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns); populateVectorToLLVMConversionPatterns(converter, patterns); @@ -668,7 +718,7 @@ populateFuncToLLVMConversionPatterns(converter, patterns); populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, target); - populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation, + populateGpuToLLVMConversionPatterns(converter, patterns, kernelBarePtrCallConv); if (failed( @@ -1069,19 +1119,11 @@ launchOp, launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); - auto binaryAttr = - kernelModule->getAttrOfType(gpuBinaryAnnotation); - if (!binaryAttr) { - kernelModule.emitOpError() - << "missing " << gpuBinaryAnnotation << " attribute"; - return failure(); - } - - SmallString<128> nameBuffer(kernelModule.getName()); - nameBuffer.append(kGpuBinaryStorageSuffix); + 
SmallString<128> nameBuffer = + gpu::getBinaryStorageStubName(kernelModule.getName()); Value data = LLVM::createGlobalString( - loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), - LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); + loc, rewriter, nameBuffer.str(), "", LLVM::Linkage::Internal, + getTypeConverter()->useOpaquePointers()); auto module = moduleLoadCallBuilder.create(loc, rewriter, data); // Get the function from the module. The name corresponds to the name of @@ -1589,7 +1631,6 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, - StringRef gpuBinaryAnnotation, bool kernelBarePtrCallConv) { addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); @@ -1623,6 +1664,5 @@ ConvertSDDMMBufferSizeOpToGpuRuntimeCallPattern, ConvertSDDMMOpToGpuRuntimeCallPattern>(converter); patterns.add( - converter, gpuBinaryAnnotation, kernelBarePtrCallConv); - patterns.add(&converter.getContext()); + converter, kernelBarePtrCallConv); } diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/Transforms/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -34,6 +35,49 @@ #include "mlir/Dialect/GPU/IR/GPUOpsDialect.cpp.inc" +//===----------------------------------------------------------------------===// +// GPU Utility functions +//===----------------------------------------------------------------------===// +StringRef mlir::gpu::getTargetAttrName() { return "target"; } + +SmallString<128> mlir::gpu::getBinaryStorageStubName(StringRef moduleName) { + static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_stub"; + SmallString<128> name(moduleName); + 
 name.append(kGpuBinaryStorageSuffix);
+  return name;
+}
+
+SmallString<128> mlir::gpu::getBinaryStorageName(StringRef moduleName) {
+  static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
+  SmallString<128> name(moduleName);
+  name.append(kGpuBinaryStorageSuffix);
+  return name;
+}
+
+//===----------------------------------------------------------------------===//
+// GPU Translation Attributes
+//===----------------------------------------------------------------------===//
+
+::mlir::LogicalResult TranslationTargetAttr::verify(
+    ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
+    TranslationPipeline pipeline, int optLevel, StringAttr triple,
+    StringAttr chip, StringAttr features, StringAttr toolkit, ArrayAttr link,
+    Attribute opts) {
+  if (optLevel > 3 || optLevel < 0) {
+    emitError() << "O" << optLevel << " is not a valid optimization level.";
+    return failure();
+  }
+  if (link) {
+    for (Attribute attr : link.getValue()) {
+      if (!mlir::isa<StringAttr>(attr)) {
+        emitError() << "All link values must be strings.";
+        return failure();
+      }
+    }
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // GPU Device Mapping Attributes
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/AMDGPUPipeline.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/AMDGPUPipeline.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/AMDGPUPipeline.cpp
@@ -0,0 +1,377 @@
+//===- AMDGPUPipeline.cpp - GPU Dialect translation for AMDGPU targets ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the serialization pipelines for AMDGPU targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ModuleToObject.h"
+#include "TranslationPipelines.h"
+
+#include "mlir/Target/LLVMIR/ModuleTranslation.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/TargetParser/TargetParser.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+#if MLIR_ROCM_CONVERSIONS_ENABLED == 1
+namespace {
+//===----------------------------------------------------------------------===//
+// Base declarations.
+//===----------------------------------------------------------------------===//
+
+// AMDGPU target initializer.
+struct InitAMDGPUTarget {
+  InitAMDGPUTarget() {
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUAsmParser();
+    LLVMInitializeAMDGPUAsmPrinter();
+  }
+};
+// This ensures that the target is initialized once.
+llvm::ManagedStatic<InitAMDGPUTarget> amdgpuTargetInit;
+
+// Base for all AMDGPU serialization pipelines.
+class AMDGPUPipelineBase : public ModuleToObject {
+public:
+  AMDGPUPipelineBase(GPUModuleOp module, AMDGPUTranslationTarget target);
+
+  // Return the translation target.
+  TranslationTarget &getTranslationTarget() override;
+
+  // Get the paths of ROCm device libraries. Function adapted from:
+  // https://github.com/llvm/llvm-project/blob/main/clang/lib/Driver/ToolChains/AMDGPU.cpp
+  void getCommonBitcodeLibs(llvm::SmallVector<std::string> &libs,
+                            SmallVector<char, 256> &libPath,
+                            StringRef isaVersion, bool wave64, bool daz,
+                            bool finiteOnly, bool unsafeMath, bool fastMath,
+                            bool correctSqrt, StringRef abiVer);
+
+  // Implementation of ModuleToObject::loadBitcodeFiles, if the toolkit path is
+  // non-empty it will try to load the ROCm device libraries and err on failure.
+  std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+  loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) override;
+
+  // Removes unnecessary metadata from the loaded bitcode files.
+  void handleBitcodeFile(llvm::Module &module,
+                         llvm::TargetMachine &targetMachine) override;
+
+protected:
+  AMDGPUTranslationTarget target;
+};
+
+//===----------------------------------------------------------------------===//
+// Pipeline declarations.
+//===----------------------------------------------------------------------===//
+
+// AMDGPU pipeline that assembles the ISA and links it into a HSA code object.
+class AMDGPUPipeline : public AMDGPUPipelineBase {
+public:
+  using AMDGPUPipelineBase::AMDGPUPipelineBase;
+
+  // Assembles the object.
+  std::optional<SmallVector<char, 0>> assembleIsa(StringRef isa);
+
+  // Create the HSACO object.
+  std::optional<SmallVector<char, 0>> createHsaco(SmallVector<char, 0> &&ptx);
+
+  // Serializes the object.
+  std::optional<SmallVector<char, 0>>
+  serializeToObject(llvm::Module &llvmModule,
+                    llvm::TargetMachine &targetMachine) override;
+
+  // Embeds the serialized object in the host module.
+  LogicalResult handleSerializedObject(
+      SmallVector<char, 0> object, llvm::IRBuilderBase &hostBuilder,
+      LLVM::ModuleTranslation &hostModuleTranslation) override;
+};
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Base pipeline methods.
+//===----------------------------------------------------------------------===// + +AMDGPUPipelineBase::AMDGPUPipelineBase(GPUModuleOp module, + AMDGPUTranslationTarget target) + : ModuleToObject(module), target(target) { + *amdgpuTargetInit; +} + +TranslationTarget &AMDGPUPipelineBase::getTranslationTarget() { return target; } + +void AMDGPUPipelineBase::getCommonBitcodeLibs( + llvm::SmallVector &libs, SmallVector &libPath, + StringRef isaVersion, bool wave64, bool daz, bool finiteOnly, + bool unsafeMath, bool fastMath, bool correctSqrt, StringRef abiVer) { + auto addLib = [&](StringRef path) { + if (!llvm::sys::fs::is_regular_file(path)) { + getOperation().emitRemark() << "Bitcode library path: " << path + << " does not exist or is not a file.\n"; + return; + } + libs.push_back(path.str()); + }; + auto optLib = [](StringRef name, bool on) -> Twine { + return name + (on ? "_on" : "_off"); + }; + auto getLibPath = [&libPath](Twine lib) { + auto baseSize = libPath.size(); + llvm::sys::path::append(libPath, lib + ".bc"); + std::string path(StringRef(libPath.data(), libPath.size()).str()); + libPath.truncate(baseSize); + return path; + }; + + // Add ROCm device libraries. + addLib(getLibPath("ocml")); + addLib(getLibPath("ockl")); + addLib(getLibPath(optLib("oclc_daz_opt", daz))); + addLib(getLibPath(optLib("oclc_unsafe_math", unsafeMath || fastMath))); + addLib(getLibPath(optLib("oclc_finite_only", finiteOnly || fastMath))); + addLib(getLibPath(optLib("oclc_correctly_rounded_sqrt", correctSqrt))); + addLib(getLibPath(optLib("oclc_wavefrontsize64", wave64))); + addLib(getLibPath("oclc_isa_version_" + isaVersion)); + if (abiVer.size()) + addLib(getLibPath("oclc_abi_version_" + abiVer)); +} + +std::optional>> +AMDGPUPipelineBase::loadBitcodeFiles(llvm::LLVMContext &context, + llvm::Module &module) { + SmallVector fileList = target.getFilesToLink(); + + // Try loading device libraries from the ROCm toolkit installation. 
+ StringRef pathRef = target.getToolkitPath(); + if (pathRef.size()) { + SmallVector path; + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + llvm::sys::path::append(path, "amdgcn", "bitcode"); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_directory(pathRef)) { + getOperation().emitRemark() << "ROCm amdgcn bitcode path: " << pathRef + << " does not exist or is not a directory."; + return std::nullopt; + } + StringRef isaVersion = llvm::AMDGPU::getArchNameAMDGCN( + llvm::AMDGPU::parseArchAMDGCN(target.getTargetChip())); + isaVersion.consume_front("gfx"); + getCommonBitcodeLibs(fileList, path, isaVersion, target.getWave64(), + target.getFtz(), target.getFiniteOnly(), + target.getUnsafeMath(), target.getFastMath(), + target.getCorrectSqrt(), target.getAbiVer()); + } + + SmallVector> bcFiles; + if (failed(loadBitcodeFilesFromList(context, fileList, bcFiles, true))) + return std::nullopt; + return bcFiles; +} + +void AMDGPUPipelineBase::handleBitcodeFile(llvm::Module &module, + llvm::TargetMachine &targetMachine) { + // Some ROCM builds don't strip this like they should + if (auto *openclVersion = module.getNamedMetadata("opencl.ocl.version")) + module.eraseNamedMetadata(openclVersion); + // Stop spamming us with clang version numbers + if (auto *ident = module.getNamedMetadata("llvm.ident")) + module.eraseNamedMetadata(ident); +} + +//===----------------------------------------------------------------------===// +// AMDGPU pipeline methods. 
+//===----------------------------------------------------------------------===// +#ifdef MLIR_GPU_TO_HSACO_TRANSLATION_ENABLED +#include "mlir/Support/FileUtilities.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/Program.h" + +std::optional> AMDGPUPipeline::assembleIsa(StringRef isa) { + auto loc = getOperation().getLoc(); + + StringRef targetTriple = target.getTargetTriple(); + StringRef chip = target.getTargetChip(); + StringRef features = target.getTargetFeatures(); + + SmallVector result; + llvm::raw_svector_ostream os(result); + + llvm::Triple triple(llvm::Triple::normalize(targetTriple)); + std::string error; + const llvm::Target *target = + llvm::TargetRegistry::lookupTarget(triple.normalize(), error); + if (!target) { + emitError(loc, Twine("failed to lookup target: ") + error); + return std::nullopt; + } + + llvm::SourceMgr srcMgr; + srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), SMLoc()); + + const llvm::MCTargetOptions mcOptions; + std::unique_ptr mri( + target->createMCRegInfo(targetTriple)); + std::unique_ptr mai( + target->createMCAsmInfo(*mri, targetTriple, mcOptions)); + mai->setRelaxELFRelocations(true); + std::unique_ptr sti( + target->createMCSubtargetInfo(targetTriple, chip, features)); + + llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr, + &mcOptions); + std::unique_ptr mofi(target->createMCObjectFileInfo( + ctx, /*PIC=*/false, /*LargeCodeModel=*/false)); + ctx.setObjectFileInfo(mofi.get()); + + SmallString<128> cwd; + if 
(!llvm::sys::fs::current_path(cwd)) + ctx.setCompilationDir(cwd); + + std::unique_ptr mcStreamer; + std::unique_ptr mcii(target->createMCInstrInfo()); + + llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, ctx); + llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions); + mcStreamer.reset(target->createMCObjectStreamer( + triple, ctx, std::unique_ptr(mab), + mab->createObjectWriter(os), std::unique_ptr(ce), + *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible, + /*DWARFMustBeAtTheEnd*/ false)); + mcStreamer->setUseAssemblerInfoForParsing(true); + + std::unique_ptr parser( + createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); + std::unique_ptr tap( + target->createMCAsmParser(*sti, *parser, *mcii, mcOptions)); + + if (!tap) { + emitError(loc, "assembler initialization error"); + return {}; + } + + parser->setTargetParser(*tap); + parser->Run(false); + + return result; +} + +std::optional> +AMDGPUPipeline::createHsaco(SmallVector &&ptx) { + SmallVector isaBinary = std::move(ptx); + auto loc = getOperation().getLoc(); + + // Save the ISA binary to a temp file. + int tempIsaBinaryFd = -1; + SmallString<128> tempIsaBinaryFilename; + if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd, + tempIsaBinaryFilename)) { + emitError(loc, "temporary file for ISA binary creation error"); + return {}; + } + llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename); + llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true); + tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size()); + tempIsaBinaryOs.close(); + + // Create a temp file for HSA code object. 
+ int tempHsacoFD = -1; + SmallString<128> tempHsacoFilename; + if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD, + tempHsacoFilename)) { + emitError(loc, "temporary file for HSA code object creation error"); + return {}; + } + llvm::FileRemover cleanupHsaco(tempHsacoFilename); + + StringRef theRocmPath = target.getToolkitPath(); + llvm::SmallString<32> lldPath(theRocmPath); + llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld"); + int lldResult = llvm::sys::ExecuteAndWait( + lldPath, + {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename}); + if (lldResult != 0) { + emitError(loc, "lld invocation error"); + return {}; + } + + // Load the HSA code object. + auto hsacoFile = openInputFile(tempHsacoFilename); + if (!hsacoFile) { + emitError(loc, "read HSA code object from temp file error"); + return {}; + } + + StringRef buffer = hsacoFile->getBuffer(); + + return SmallVector(buffer.begin(), buffer.end()); +} + +std::optional> +AMDGPUPipeline::serializeToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + std::optional serializedISA = + translateToISA(llvmModule, targetMachine); + + if (!serializedISA) { + getOperation().emitError() << "Failed translating the Module to ISA."; + return std::nullopt; + } + std::optional> assembledIsa = + assembleIsa(serializedISA.value()); + + if (!assembledIsa) { + getOperation().emitError() << "Failed during ISA assembling."; + return std::nullopt; + } + + return createHsaco(std::move(assembledIsa.value())); +} + +LogicalResult AMDGPUPipeline::handleSerializedObject( + SmallVector object, llvm::IRBuilderBase &hostBuilder, + LLVM::ModuleTranslation &hostModuleTranslation) { + return embedBinaryObject(object, hostBuilder, hostModuleTranslation); +} + +LogicalResult +mlir::gpu::runAMDGPUPipeline(GPUModuleOp module, AMDGPUTranslationTarget target, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + return AMDGPUPipeline(module, 
target).run(builder, moduleTranslation); +} +#endif +#endif + +#if MLIR_ROCM_CONVERSIONS_ENABLED == 0 || \ + !defined(MLIR_GPU_TO_HSACO_TRANSLATION_ENABLED) +LogicalResult runAMDGPUPipeline(GPUModuleOp module, + AMDGPUTranslationTarget target, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + return success(); +} +#endif diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt @@ -1,13 +1,115 @@ +if (MLIR_ENABLE_CUDA_CONVERSIONS) + set(NVPTX_LIBS + NVPTXCodeGen + NVPTXDesc + NVPTXInfo + ) +endif() + +if (MLIR_ENABLE_ROCM_CONVERSIONS) + set(AMDGPU_LIBS + IRReader + IPO + linker + MCParser + AMDGPUAsmParser + AMDGPUCodeGen + AMDGPUDesc + AMDGPUInfo + target + ) +endif() + add_mlir_translation_library(MLIRGPUToLLVMIRTranslation + AMDGPUPipeline.cpp GPUToLLVMIRTranslation.cpp + GPUTranslationTargets.cpp + ModuleToObject.cpp + NVPTXPipeline.cpp LINK_COMPONENTS Core + MC + ${NVPTX_LIBS} + ${AMDGPU_LIBS} LINK_LIBS PUBLIC MLIRIR + MLIRExecutionEngineUtils MLIRGPUDialect MLIRLLVMDialect MLIRSupport MLIRTargetLLVMIRExport ) + +if(MLIR_ENABLE_CUDA_RUNNER) + if(NOT MLIR_ENABLE_CUDA_CONVERSIONS) + message(SEND_ERROR + "Building mlir with cuda support requires the NVPTX backend") + endif() + + # Configure CUDA language support. Using check_language first allows us to + # give a custom error message. + include(CheckLanguage) + check_language(CUDA) + if (CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(SEND_ERROR + "Building mlir with cuda support requires a working CUDA install") + endif() + + if (NOT DEFINED CUDAToolkit_ROOT) + find_package(CUDAToolkit) + get_filename_component(CUDAToolkit_ROOT ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + endif() + message(VERBOSE "MLIR Default CUDA toolkit path: ${CUDAToolkit_ROOT}") + + # Enable gpu to cubin translation. 
+ target_compile_definitions(obj.MLIRGPUToLLVMIRTranslation + PRIVATE + MLIR_GPU_TO_CUBIN_TRANSLATION_ENABLED=1 + __DEFAULT_CUDATOOLKIT_PATH__="${CUDAToolkit_ROOT}" + ) + + # Add CUDA headers includes and the libcuda.so library. + target_include_directories(obj.MLIRGPUToLLVMIRTranslation + PRIVATE + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ) + + find_library(CUDA_DRIVER_LIBRARY cuda) + + target_link_libraries(MLIRGPUToLLVMIRTranslation + PRIVATE + MLIRNVVMToLLVMIRTranslation + ${CUDA_DRIVER_LIBRARY} + ) + +endif() + +if(MLIR_ENABLE_ROCM_CONVERSIONS) + if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)) + message(SEND_ERROR + "Building mlir with ROCm support requires the AMDGPU backend") + endif() + if (DEFINED ROCM_PATH) + set(DEFAULT_ROCM_PATH "${ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs") + elseif(DEFINED ENV{ROCM_PATH}) + set(DEFAULT_ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs") + else() + set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs") + endif() + message(VERBOSE "MLIR Default ROCM toolkit path: ${DEFAULT_ROCM_PATH}") + + target_compile_definitions(obj.MLIRGPUToLLVMIRTranslation + PRIVATE + __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}" + MLIR_GPU_TO_HSACO_TRANSLATION_ENABLED=1 + ) + + target_link_libraries(MLIRGPUToLLVMIRTranslation + PRIVATE + MLIRROCDLToLLVMIRTranslation + ) +endif() diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp @@ -9,14 +9,37 @@ // This file implements a translation between the MLIR GPU dialect and LLVM IR. 
// //===----------------------------------------------------------------------===// + #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" + +#include "TranslationPipelines.h" +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.h" #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" +#include "llvm/ADT/TypeSwitch.h" using namespace mlir; +using namespace mlir::gpu; -namespace { +static LogicalResult +handleModuleOp(gpu::GPUModuleOp module, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + if (Attribute attr = module->removeAttr("target")) { + if (auto targetAttr = dyn_cast(attr)) { + TranslationPipeline pipeline = targetAttr.getPipeline(); + if (pipeline == TranslationPipeline::NVPTX) { + return runNVPTXDriverPipeline(module, + NVPTXTranslationTarget(targetAttr), + builder, moduleTranslation); + } else if (pipeline == TranslationPipeline::AMDGPU) { + return runAMDGPUPipeline(module, AMDGPUTranslationTarget(targetAttr), + builder, moduleTranslation); + } + } + } + return success(); +} +namespace { class GPUDialectLLVMIRTranslationInterface : public LLVMTranslationDialectInterface { public: @@ -24,13 +47,22 @@ LogicalResult convertOperation(Operation *op, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation) const override { - return isa(op) ? 
success() : failure(); - } + LLVM::ModuleTranslation &moduleTranslation) const override; }; - } // namespace +LogicalResult GPUDialectLLVMIRTranslationInterface::convertOperation( + Operation *op, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) const { + return llvm::TypeSwitch(op) + .Case([&](gpu::GPUModuleOp module) { + return handleModuleOp(module, builder, moduleTranslation); + }) + .Default([&](Operation *op) { + return op->emitError("unsupported GPU operation: ") << op->getName(); + }); +} + void mlir::registerGPUDialectTranslation(DialectRegistry ®istry) { registry.insert(); registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) { diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.cpp @@ -0,0 +1,134 @@ +//===- GPUTranslationTargets.cpp - GPU Dialect translation targets ------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This files provides interfaces for interacting with `TranslationTargetAttr`. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.h" + +using namespace mlir; +using namespace mlir::gpu; + +//===----------------------------------------------------------------------===// +// Toolkit paths if not defined by CMake. 
+//===----------------------------------------------------------------------===// + +#ifndef __DEFAULT_CUDATOOLKIT_PATH__ +#define __DEFAULT_CUDATOOLKIT_PATH__ "" +#endif + +#ifndef __DEFAULT_ROCM_PATH__ +#define __DEFAULT_ROCM_PATH__ "" +#endif + +//===----------------------------------------------------------------------===// +// TranslationTarget methods. +//===----------------------------------------------------------------------===// + +StringRef TranslationTarget::getTargetTriple() { + StringAttr triple = target.getTriple(); + return triple ? triple.getValue() : defaultTriple; +} + +StringRef TranslationTarget::getTargetChip() { + StringAttr chip = target.getChip(); + return chip ? chip.getValue() : defaultChip; +} + +StringRef TranslationTarget::getTargetFeatures() { + StringAttr features = target.getFeatures(); + return features ? features.getValue() : defaultFeatures; +} + +StringRef TranslationTarget::getToolkitPath() { + StringAttr toolkit = target.getToolkit(); + return toolkit ? 
toolkit.getValue() : defaultToolkitPath; +} + +SmallVector TranslationTarget::getFilesToLink() { + SmallVector fileList; + if (ArrayAttr files = target.getLink()) + for (auto attr : files.getValue()) + if (auto file = dyn_cast(attr)) + fileList.push_back(file.str()); + return fileList; +} + +int TranslationTarget::getOptLevel() { return target.getO(); } + +Attribute TranslationTarget::getTargetOptions() { return target.getOpts(); } + +Attribute TranslationTarget::getTargetOption(StringRef optionName) { + if (Attribute opts = target.getOpts()) + if (DictionaryAttr optsDict = dyn_cast(opts)) + return optsDict.get(optionName); + return nullptr; +} + +bool TranslationTarget::getFastMath() { + return getTargetOption("fast") != nullptr; +} + +bool TranslationTarget::getFtz() { return getTargetOption("ftz") != nullptr; } + +TranslationTarget::TranslationTarget(TranslationTargetAttr target, + StringRef defaultTriple, + StringRef defaultChip, + StringRef defaultFeatures, + StringRef defaultToolkitPath) + : target(target), defaultTriple(defaultTriple), defaultChip(defaultChip), + defaultFeatures(defaultFeatures), defaultToolkitPath(defaultToolkitPath) { + assert(target && "The target must be non null."); +} + +//===----------------------------------------------------------------------===// +// NVPTXTranslationTarget methods. +//===----------------------------------------------------------------------===// + +NVPTXTranslationTarget::NVPTXTranslationTarget(TranslationTargetAttr target) + : TranslationTarget(target, kDefaultTriple, kDefaultChip, kDefaultFeatures, + getDefaultToolkitPath()) {} + +StringRef NVPTXTranslationTarget::getDefaultToolkitPath() { + return __DEFAULT_CUDATOOLKIT_PATH__; +} + +//===----------------------------------------------------------------------===// +// AMDGPUTranslationTarget methods. 
+//===----------------------------------------------------------------------===// + +AMDGPUTranslationTarget::AMDGPUTranslationTarget(TranslationTargetAttr target) + : TranslationTarget(target, kDefaultTriple, kDefaultChip, kDefaultFeatures, + getDefaultToolkitPath()) {} + +StringRef AMDGPUTranslationTarget::getDefaultToolkitPath() { + return __DEFAULT_ROCM_PATH__; +} + +bool AMDGPUTranslationTarget::getWave64() { + return getTargetOption("wave64") != nullptr || + getTargetOption("noWave64") == nullptr; +} + +bool AMDGPUTranslationTarget::getFiniteOnly() { + return getTargetOption("finite_only") != nullptr; +} + +bool AMDGPUTranslationTarget::getUnsafeMath() { + return getTargetOption("unsafe_math") != nullptr; +} + +bool AMDGPUTranslationTarget::getCorrectSqrt() { + return getTargetOption("correct_sqrt") != nullptr; +} + +StringRef AMDGPUTranslationTarget::getAbiVer() { + auto abiVer = getTargetOption("abi_ver"); + return abiVer ? abiVer.getValue() : StringRef(kDefaultAbiVer); +} diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/ModuleToObject.h b/mlir/lib/Target/LLVMIR/Dialect/GPU/ModuleToObject.h new file mode 100644 --- /dev/null +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/ModuleToObject.h @@ -0,0 +1,126 @@ +//===- ModuleToObject.h - GPU Module to object base class -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the base class for transforming GPUModuleOps into binary +// annotations. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TARGET_LLVMIR_DIALECT_GPU_MODULETOOBJECT_H +#define MLIR_TARGET_LLVMIR_DIALECT_GPU_MODULETOOBJECT_H + +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.h" + +#include "llvm/IR/Module.h" + +namespace llvm { +class IRBuilderBase; +class TargetMachine; +} // namespace llvm + +namespace mlir { +namespace LLVM { +class ModuleTranslation; +} +namespace gpu { +static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst"; + +// Base class for all GPUToObject* translations. +class ModuleToObject { +public: + ModuleToObject(GPUModuleOp module); + virtual ~ModuleToObject() = default; + + // Returns the gpu.module being serialized. + GPUModuleOp getOperation(); + + // Returns the translation target. + virtual TranslationTarget &getTranslationTarget() = 0; + + // Runs the serialization pipeline, returning failure on error. + LogicalResult run(llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation); + +protected: + // Hooks to be implemented by derived classes. + + // Hook for loading bitcode files, returns std::nullopt on failure. + virtual std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) { + return SmallVector>(); + } + + // Hook for performing additional actions on a loaded bitcode file. + virtual void handleBitcodeFile(llvm::Module &module, + llvm::TargetMachine &targetMachine) {} + + // Hook for performing additional actions on the llvmModule pre linking. + virtual void handleModulePreLink(llvm::Module &module, + llvm::TargetMachine &targetMachine) {} + + // Hook for performing additional actions on the llvmModule post linking. + virtual void handleModulePostLink(llvm::Module &module, + llvm::TargetMachine &targetMachine) {} + + // Serializes the LLVM IR bitcode to an object file. 
+ virtual std::optional> + serializeToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + return {}; + } + + // Hook for performing actions on the serialized object and the host module. + virtual LogicalResult + handleSerializedObject(SmallVector object, + llvm::IRBuilderBase &hostBuilder, + LLVM::ModuleTranslation &hostModuleTranslation) = 0; + +protected: + // Create the target machine based on the target triple and chip. + std::unique_ptr createTargetMachine(); + + // Loads a bitcode file from path. + std::unique_ptr loadBitcodeFile(llvm::LLVMContext &context, + StringRef path); + + // Loads multiple bitcode files. + virtual LogicalResult loadBitcodeFilesFromList( + llvm::LLVMContext &context, ArrayRef fileList, + SmallVector> &llvmModules, + bool failureOnError = true); + + // Translates the gpu.module to LLVM IR. + std::unique_ptr + translateToLLVMIR(llvm::LLVMContext &llvmContext); + + // Link the llvmModule to other bitcode file. + LogicalResult linkFiles(llvm::Module &module, + SmallVector> &&libs); + + // Optimize the module. + LogicalResult optimizeModule(llvm::Module &module, + llvm::TargetMachine &targetMachine, + int optLevel = 3); + + // Utility function for translating to ISA, returns `std::nullopt` on failure. + static std::optional + translateToISA(llvm::Module &llvmModule, llvm::TargetMachine &targetMachine); + + // Utility function for embedding the binary object as a global constant + // string for regular serialization pipelines. 
+ LogicalResult + embedBinaryObject(SmallVector object, llvm::IRBuilderBase &hostBuilder, + LLVM::ModuleTranslation &hostModuleTranslation); + +private: + GPUModuleOp module; +}; +} // namespace gpu +} // namespace mlir + +#endif diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/ModuleToObject.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/ModuleToObject.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/ModuleToObject.cpp @@ -0,0 +1,256 @@ +//===- ModuleToObject.cpp - GPU Module to object base class ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the base class for transforming GPUModuleOps into binary +// annotations. +// +//===----------------------------------------------------------------------===// + +#include "ModuleToObject.h" + +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Target/LLVMIR/ModuleTranslation.h" + +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/TargetParser.h" +#include "llvm/Transforms/IPO/Internalize.h" + +using namespace mlir; +using namespace mlir::gpu; + +ModuleToObject::ModuleToObject(GPUModuleOp module) : module(module) { + assert(module && "The module must be non null."); +} + +GPUModuleOp ModuleToObject::getOperation() { return 
module; } + +std::unique_ptr ModuleToObject::createTargetMachine() { + TranslationTarget &translationTarget = getTranslationTarget(); + std::string triple = translationTarget.getTargetTriple().str(); + std::string error; + // Load the target. + const llvm::Target *target = + llvm::TargetRegistry::lookupTarget(triple, error); + if (!target) { + getOperation().emitError() << "Failed to lookup target: " << error; + return {}; + } + + // Create the target machine using the target. + llvm::TargetMachine *machine = target->createTargetMachine( + triple, translationTarget.getTargetChip(), + translationTarget.getTargetFeatures(), {}, {}); + if (!machine) { + getOperation().emitError() << "Failed to create target machine"; + return {}; + } + return std::unique_ptr{machine}; +} + +std::unique_ptr +ModuleToObject::loadBitcodeFile(llvm::LLVMContext &context, StringRef path) { + llvm::SMDiagnostic error; + std::unique_ptr library = + llvm::getLazyIRFileModule(path, error, context); + if (!library) { + getOperation().emitError() << "Failed loading file from " << path + << ", error: " << error.getMessage(); + return nullptr; + } + return library; +} + +LogicalResult ModuleToObject::loadBitcodeFilesFromList( + llvm::LLVMContext &context, ArrayRef fileList, + SmallVector> &llvmModules, + bool failureOnError) { + for (const std::string &str : fileList) { + // Test if the path exists, if it doesn't abort. + StringRef pathRef = StringRef(str.data(), str.size()); + if (!llvm::sys::fs::is_regular_file(pathRef)) { + getOperation().emitError() + << "File path: " << pathRef << " does not exist or is not a file.\n"; + return failure(); + } + // Load the file or abort on error. 
+ if (auto bcFile = loadBitcodeFile(context, pathRef)) + llvmModules.push_back(std::move(bcFile)); + else if (failureOnError) + return failure(); + } + return success(); +} + +std::unique_ptr +ModuleToObject::translateToLLVMIR(llvm::LLVMContext &llvmContext) { + return translateModuleToLLVMIR(getOperation(), llvmContext, + "GPUDialectModule"); +} + +LogicalResult +ModuleToObject::linkFiles(llvm::Module &module, + SmallVector> &&libs) { + if (libs.empty()) + return success(); + llvm::Linker linker(module); + for (std::unique_ptr &libModule : libs) { + // This bitcode linking code is substantially similar to what is used in + // hip-clang It imports the library functions into the module, allowing LLVM + // optimization passes (which must run after linking) to optimize across the + // libraries and the module's code. We also only import symbols if they are + // referenced by the module or a previous library since there will be no + // other source of references to those symbols in this compilation and since + // we don't want to bloat the resulting code object. 
+ bool err = linker.linkInModule( + std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded, + [](llvm::Module &m, const StringSet<> &gvs) { + llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) { + return !gv.hasName() || (gvs.count(gv.getName()) == 0); + }); + }); + // True is linker failure + if (err) { + getOperation().emitError("Unrecoverable failure during bitcode linking."); + // We have no guaranties about the state of `ret`, so bail + return failure(); + } + } + return success(); +} + +LogicalResult ModuleToObject::optimizeModule(llvm::Module &module, + llvm::TargetMachine &targetMachine, + int optLevel) { + if (optLevel < 0 || optLevel > 3) + return getOperation().emitError() + << "Invalid optimization level" << optLevel << "\n"; + + targetMachine.setOptLevel(static_cast(optLevel)); + + auto transformer = + makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine); + auto error = transformer(&module); + if (error) { + InFlightDiagnostic mlirError = getOperation().emitError(); + llvm::handleAllErrors( + std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) { + mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n"; + }); + return mlirError; + } + return success(); +} + +std::optional +ModuleToObject::translateToISA(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + std::string targetISA; + llvm::raw_string_ostream stream(targetISA); + + { // Drop pstream after this to prevent the ISA from being stuck buffering + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegenPasses; + + if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, + llvm::CGFT_AssemblyFile)) + return std::nullopt; + + codegenPasses.run(llvmModule); + } + return stream.str(); +} + +LogicalResult ModuleToObject::embedBinaryObject( + SmallVector object, llvm::IRBuilderBase &hostBuilder, + LLVM::ModuleTranslation &hostModuleTranslation) { + llvm::Module *hostModule = 
hostModuleTranslation.getLLVMModule(); + assert(hostModule && "The host module can't be null."); + + // Get the `gpu.module` name. + SmallString<128> nameBuffer = + getBinaryStorageStubName(getOperation().getName()); + if (llvm::GlobalVariable *gv = + hostModule->getGlobalVariable(nameBuffer, true)) { + // Create the new global variable with the serialized object. + nameBuffer = getBinaryStorageName(getOperation().getName()); + llvm::GlobalVariable *serializedObj = hostBuilder.CreateGlobalString( + StringRef(object.data(), object.size()), nameBuffer, 0, hostModule); + serializedObj->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage); + serializedObj->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); + serializedObj->setAlignment(llvm::MaybeAlign(8)); + + // Update all uses to point to the correct global variable. + gv->replaceAllUsesWith(serializedObj); + hostModule->eraseGlobalVariable(gv); + } else { + getOperation().emitError() + << "There's no global variable for embedding this module."; + return failure(); + } + return success(); +} + +LogicalResult +ModuleToObject::run(llvm::IRBuilderBase &hostBuilder, + LLVM::ModuleTranslation &hostModuleTranslation) { + TranslationTarget &translationTarget = getTranslationTarget(); + // Translate the GPUModule to LLVM IR. + llvm::LLVMContext llvmContext; + std::unique_ptr llvmModule = translateToLLVMIR(llvmContext); + if (!llvmModule) { + getOperation().emitError() << "Failed creating the llvm::Module."; + return failure(); + } + + // Create the target machine. + std::unique_ptr targetMachine = createTargetMachine(); + if (!targetMachine) + return failure(); + + // Set the data layout and target triple of the module. + llvmModule->setDataLayout(targetMachine->createDataLayout()); + llvmModule->setTargetTriple(targetMachine->getTargetTriple().getTriple()); + + // Link bitcode files. 
+ handleModulePreLink(*llvmModule, *targetMachine); + { + auto libs = loadBitcodeFiles(llvmContext, *llvmModule); + if (!libs) + return failure(); + if (libs->size()) + if (failed(linkFiles(*llvmModule, std::move(*libs)))) + return failure(); + handleModulePostLink(*llvmModule, *targetMachine); + } + + // Optimize the module. + int optLevel = translationTarget.getOptLevel(); + if (failed(optimizeModule(*llvmModule, *targetMachine, optLevel))) + return failure(); + + // Perform additional manipulations on the serialized object. + std::optional> object = + serializeToObject(*llvmModule, *targetMachine); + if (!object) { + getOperation().emitError() << "Failed while serializing the module."; + return failure(); + } + return handleSerializedObject(object.value(), hostBuilder, + hostModuleTranslation); +} diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/NVPTXPipeline.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/NVPTXPipeline.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/NVPTXPipeline.cpp @@ -0,0 +1,228 @@ +//===- NVPTXPipeline.cpp - GPU Dialect translation for NVPTX targets ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This files provides the serialization pipelines for NVPTX targets. 
+// +//===----------------------------------------------------------------------===// + +#include "ModuleToObject.h" +#include "TranslationPipelines.h" + +#include "llvm/IR/Constants.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/TargetSelect.h" + +using namespace mlir; +using namespace mlir::gpu; + +#if MLIR_CUDA_CONVERSIONS_ENABLED == 1 +namespace { +//===----------------------------------------------------------------------===// +// Base declarations. +//===----------------------------------------------------------------------===// + +// NVPTX target initializer. +struct InitNVPTXTarget { + InitNVPTXTarget() { + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + } +}; +// This ensures that the target is initialized once. +llvm::ManagedStatic nvptxTargetInit; + +// Base for all NVPTX serialization pipelines. +class NVPTXPipelineBase : public ModuleToObject { +public: + NVPTXPipelineBase(GPUModuleOp module, NVPTXTranslationTarget target); + + // Return the translation target. + TranslationTarget &getTranslationTarget() override; + + // Implementation of ModuleToObject::loadBitcodeFiles, if the toolkit path is + // non empty it will try to load `libdevice` and err on failure. + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) override; + +protected: + NVPTXTranslationTarget target; +}; + +//===----------------------------------------------------------------------===// +// Pipeline declarations. +//===----------------------------------------------------------------------===// + +// NVPTX pipeline using the driver to compile to cubin. +class NVPTXDriverPipeline : public NVPTXPipelineBase { +public: + using NVPTXPipelineBase::NVPTXPipelineBase; + + // Serializes the object using the JIT in the CUDA driver. 
+ std::optional> + serializeToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) override; + + // Embeds the serialized object in the host module. + LogicalResult handleSerializedObject( + SmallVector object, llvm::IRBuilderBase &hostBuilder, + LLVM::ModuleTranslation &hostModuleTranslation) override; +}; +} // namespace + +//===----------------------------------------------------------------------===// +// Base pipeline methods. +//===----------------------------------------------------------------------===// + +NVPTXPipelineBase::NVPTXPipelineBase(GPUModuleOp module, + NVPTXTranslationTarget target) + : ModuleToObject(module), target(target) { + *nvptxTargetInit; +} + +TranslationTarget &NVPTXPipelineBase::getTranslationTarget() { return target; } + +std::optional>> +NVPTXPipelineBase::loadBitcodeFiles(llvm::LLVMContext &context, + llvm::Module &module) { + SmallVector fileList = target.getFilesToLink(); + + // Try loading `libdevice` from a CUDA toolkit installation. + StringRef pathRef = target.getToolkitPath(); + if (pathRef.size()) { + SmallVector path; + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_directory(pathRef)) { + getOperation().emitError() << "CUDA path: " << pathRef + << " does not exist or is not a directory.\n"; + return std::nullopt; + } + // TODO remove this hard coded path. 
+ llvm::sys::path::append(path, "nvvm", "libdevice", "libdevice.10.bc"); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_regular_file(pathRef)) { + getOperation().emitError() << "LibDevice path: " << pathRef + << " does not exist or is not a file.\n"; + return std::nullopt; + } + fileList.push_back(pathRef.str()); + } + + SmallVector> bcFiles; + if (failed(loadBitcodeFilesFromList(context, fileList, bcFiles, true))) + return std::nullopt; + return bcFiles; +} + +//===----------------------------------------------------------------------===// +// Driver pipeline methods. +//===----------------------------------------------------------------------===// +#ifdef MLIR_GPU_TO_CUBIN_TRANSLATION_ENABLED +#include + +static void emitCudaError(const llvm::Twine &expr, const char *buffer, + CUresult result, Location loc) { + const char *error; + cuGetErrorString(result, &error); + emitError(loc, expr.concat(" failed with error code ") + .concat(llvm::Twine{error}) + .concat("[") + .concat(buffer) + .concat("]")); +} + +#define RETURN_ON_CUDA_ERROR(expr) \ + do { \ + if (auto status = (expr)) { \ + emitCudaError(#expr, jitErrorBuffer, status, loc); \ + return {}; \ + } \ + } while (false) + +std::optional> +NVPTXDriverPipeline::serializeToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + std::optional serializedISA = + translateToISA(llvmModule, targetMachine); + if (serializedISA) { + auto loc = getOperation().getLoc(); + char jitErrorBuffer[4096] = {0}; + + RETURN_ON_CUDA_ERROR(cuInit(0)); + + // Linking requires a device context. 
+ CUdevice device; + RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0)); + CUcontext context; + RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device)); + CUlinkState linkState; + + CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}; + void *jitOptionsVals[] = {jitErrorBuffer, + reinterpret_cast(sizeof(jitErrorBuffer))}; + + RETURN_ON_CUDA_ERROR(cuLinkCreate(2, /* number of jit options */ + jitOptions, /* jit options */ + jitOptionsVals, /* jit option values */ + &linkState)); + + auto kernelName = getOperation().getName().str(); + RETURN_ON_CUDA_ERROR(cuLinkAddData( + linkState, CUjitInputType::CU_JIT_INPUT_PTX, + const_cast(static_cast(serializedISA->c_str())), + serializedISA->length(), kernelName.c_str(), + 0, /* number of jit options */ + nullptr, /* jit options */ + nullptr /* jit option values */ + )); + + void *cubinData; + size_t cubinSize; + RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize)); + + char *cubinAsChar = static_cast(cubinData); + auto result = SmallVector(cubinAsChar, cubinAsChar + cubinSize); + + // This will also destroy the cubin data. 
+    RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
+    RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
+    return result;
+  } else {
+    getOperation().emitError() << "Failed translating the Module to ISA.";
+    return std::nullopt;
+  }
+}
+
+LogicalResult NVPTXDriverPipeline::handleSerializedObject(
+    SmallVector<char> object, llvm::IRBuilderBase &hostBuilder,
+    LLVM::ModuleTranslation &hostModuleTranslation) {
+  return embedBinaryObject(object, hostBuilder, hostModuleTranslation);
+}
+
+LogicalResult mlir::gpu::runNVPTXDriverPipeline(
+    GPUModuleOp module, NVPTXTranslationTarget target,
+    llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) {
+  return NVPTXDriverPipeline(module, target).run(builder, moduleTranslation);
+}
+
+#endif
+#endif
+
+#if MLIR_CUDA_CONVERSIONS_ENABLED == 0 ||                                      \
+    !defined(MLIR_GPU_TO_CUBIN_TRANSLATION_ENABLED)
+LogicalResult mlir::gpu::runNVPTXDriverPipeline(
+    GPUModuleOp module, NVPTXTranslationTarget target,
+    llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) {
+  return success();
+}
+#endif
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/TranslationPipelines.h b/mlir/lib/Target/LLVMIR/Dialect/GPU/TranslationPipelines.h
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/TranslationPipelines.h
@@ -0,0 +1,40 @@
+//===- TranslationPipelines.h - GPU translation pipelines -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares all available GPU translation pipelines.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_GPU_TRANSLATIONPIPELINES_H
+#define MLIR_TARGET_LLVMIR_DIALECT_GPU_TRANSLATIONPIPELINES_H
+
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUTranslationTargets.h"
+
+namespace llvm {
+class IRBuilderBase;
+}
+
+namespace mlir {
+namespace LLVM {
+class ModuleTranslation;
+}
+
+namespace gpu {
+LogicalResult runAMDGPUPipeline(GPUModuleOp module,
+                                AMDGPUTranslationTarget target,
+                                llvm::IRBuilderBase &builder,
+                                LLVM::ModuleTranslation &moduleTranslation);
+
+LogicalResult
+runNVPTXDriverPipeline(GPUModuleOp module, NVPTXTranslationTarget target,
+                       llvm::IRBuilderBase &builder,
+                       LLVM::ModuleTranslation &moduleTranslation);
+} // namespace gpu
+} // namespace mlir
+
+#endif
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -1,14 +1,17 @@
-// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin use-opaque-pointers=1" | FileCheck %s
-// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco use-opaque-pointers=1" | FileCheck %s --check-prefix=ROCDL
+// RUN: mlir-opt %s --gpu-to-llvm="target=nvvm.cubin use-opaque-pointers=1" | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm="target=rocdl.hsaco use-opaque-pointers=1" | FileCheck %s --check-prefix=ROCDL
+// RUN: mlir-opt %s --gpu-to-llvm='target="NVPTX: chip = "sm_70", opts = {fast}" use-opaque-pointers=1' | FileCheck %s --check-prefix=NVVM

 module attributes {gpu.container_module} {

   // CHECK: llvm.mlir.global internal constant @[[KERNEL_NAME:.*]]("kernel\00")
-  // CHECK: llvm.mlir.global internal constant @[[GLOBAL:.*]]("CUBIN")
-  // ROCDL: llvm.mlir.global internal constant @[[GLOBAL:.*]]("HSACO")
+  //
CHECK: llvm.mlir.global internal constant @[[GLOBAL:.*]]("") + // CHECK: gpu.module @kernel_module attributes {rocdl.hsaco = #gpu.target, target = #gpu.target} + // ROCDL: gpu.module @kernel_module attributes {nvvm.cubin = #gpu.target, target = #gpu.target} + // NVVM: gpu.module @kernel_module attributes {nvvm.cubin = #gpu.target, rocdl.hsaco = #gpu.target, target = #gpu.target} gpu.module @kernel_module attributes { - nvvm.cubin = "CUBIN", rocdl.hsaco = "HSACO" + nvvm.cubin = #gpu.target, rocdl.hsaco = #gpu.target } { llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64,