diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -40,6 +40,8 @@
 #define GEN_PASS_DECL_GPUTOLLVMCONVERSIONPASS
 #include "mlir/Conversion/Passes.h.inc"
+#define GEN_PASS_DECL_GPUTOOFFLOADCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
 
 using OwnedBlob = std::unique_ptr<std::vector<char>>;
 using BlobGenerator =
@@ -54,6 +56,18 @@
     StringRef gpuBinaryAnnotation = {}, bool kernelBarePtrCallConv = false);
 
+/// Collect a set of patterns to convert from the GPU dialect to LLVM with
+/// offload annotations and populate the converter for gpu types.
+void populateGpuToLLVMOffloadConversionPatterns(
+    LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    bool kernelBarePtrCallConv = false);
+
+/// Utility function that concatenates all offloading objects in a module into
+/// a single object inserted at the start of the module. If the module contains
+/// no gpu.modules, the module is left unchanged. Returns failure if a
+/// gpu.module doesn't contain a valid offload annotation.
+LogicalResult makeLLVMOffloadObject(ModuleOp module,
+                                    LLVMTypeConverter &converter);
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -381,6 +381,39 @@
   ];
 }
 
+def GpuToOffloadConversionPass : Pass<"gpu-to-offload", "ModuleOp"> {
+  let summary = "Convert GPU dialect to LLVM dialect with offload annotations.";
+
+  let description = [{
+    Creates a pass that converts GPU operations into a sequence of GPU runtime
+    calls and adds clang-compatible offload annotations for clang's new offload
+    driver.
+
+    This pass does not generate code to call GPU runtime APIs directly but
+    instead uses a small wrapper library that exports a stable and conveniently
+    typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
+    For more information:
+    https://clang.llvm.org/docs/OffloadingDesign.html
+    https://clang.llvm.org/docs/ClangOffloadPackager.html
+  }];
+
+  let options = [
+    Option<"kernelBarePtrCallConv", "use-bare-pointers-for-kernels", "bool",
+           /*default=*/"false",
+           "Use bare pointers to pass memref arguments to kernels. "
+           "The kernel must use the same setting for this option."
+    >,
+    Option<"useOpaquePointers", "use-opaque-pointers", "bool",
+           /*default=*/"true", "Generate LLVM IR using opaque pointers "
+           "instead of typed pointers">,
+  ];
+
+  let dependentDialects = [
+    "::mlir::LLVM::LLVMDialect",
+    "memref::MemRefDialect",
+  ];
+}
+
 def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
   let summary = "Lowers the host module code and `gpu.launch_func` to LLVM";
 
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -31,6 +31,14 @@
 class FuncOp;
 } // namespace func
 
+namespace gpu {
+/// Returns the default CUDA toolkit path, or an empty string if not found.
+StringRef getDefaultCudaToolkitPath();
+
+/// Returns the default ROCm path.
+StringRef getDefaultRocmPath();
+} // namespace gpu
+
 #define GEN_PASS_DECL
 #include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
 
@@ -62,6 +70,33 @@
 }
 
 namespace gpu {
+/// GPU offload kinds.
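+///
+/// For illustration, the string helpers below round-trip these kinds:
+/// `getOffloadKind("cuda") == OffloadKind::cuda`,
+/// `fromOffloadKind(OffloadKind::hip) == "hip"`, and any unrecognized string
+/// maps to `OffloadKind::unk`.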
+enum class OffloadKind { unk = -1, cuda, hip };
+
+/// Convert a string to offload kind.
+inline OffloadKind getOffloadKind(StringRef kind) {
+  if (kind == "cuda")
+    return OffloadKind::cuda;
+  else if (kind == "hip")
+    return OffloadKind::hip;
+  return OffloadKind::unk;
+}
+
+/// Get the string representation of an offload kind.
+inline StringRef fromOffloadKind(OffloadKind kind) {
+  if (kind == OffloadKind::cuda)
+    return "cuda";
+  else if (kind == OffloadKind::hip)
+    return "hip";
+  return "unk";
+}
+
+/// Returns the annotation name for GPU offload object blobs.
+StringRef getGpuOffloadObjectAnnotation();
+
+/// Returns the annotation name for GPU offload kind.
+StringRef getGpuOffloadKindAnnotation();
+
 /// Base pass class to serialize kernel functions through LLVM into
 /// user-specified IR and add the resulting blob as module attribute.
 class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -37,4 +37,101 @@
   let dependentDialects = ["mlir::gpu::GPUDialect"];
 }
 
+def GpuNameMangling: Pass<"gpu-name-mangling",
+                          "::mlir::ModuleOp"> {
+  let summary = "Mangle the names of all the top symbols inside a gpu.module.";
+  let description = [{
+    Mangles the names of all the top level definitions inside a `gpu.module`,
+    for all the `gpu.module`s inside a `module`, from: `<symbol>` to:
+    `__G<gpu module name>_S<symbol>`, and updates all the symbol references.
+  }];
+  let dependentDialects = ["mlir::gpu::GPUDialect"];
+}
+
+def GpuToNVPTXOffload: Pass<"gpu-to-nvptx",
+                            "::mlir::gpu::GPUModuleOp"> {
+  let summary = "Lowers a `gpu.module` to a binary annotation with NVVM code.";
+  let description = [{
+    Lowers a `gpu.module` with NVVM LLVM IR to a binary annotation compatible
+    with clang's new offload driver, and with the tool: `clang-offload-packager`.
+    For more information:
+    https://clang.llvm.org/docs/OffloadingDesign.html
+    https://clang.llvm.org/docs/ClangOffloadPackager.html
+  }];
+  let options = [
+    Option<"triple", "triple", "std::string",
+           /*default=*/ "\"nvptx64-nvidia-cuda\"",
+           "Target triple.">,
+    Option<"chip", "chip", "std::string",
+           /*default=*/"\"sm_35\"",
+           "Target chip.">,
+    Option<"features", "features", "std::string",
+           /*default=*/"\"+ptx60\"",
+           "Target features.">,
+    Option<"cudaPath", "cuda-path", "std::string",
+           /*default=*/"gpu::getDefaultCudaToolkitPath().str()",
+           "CUDA Toolkit path.">,
+    Option<"optLevel", "O", "unsigned",
+           /*default=*/"0",
+           "Optimization level.">,
+    ListOption<"bcPaths", "libs", "std::string",
+               "Extra bitcode library paths.">,
+  ];
+  let dependentDialects = ["mlir::gpu::GPUDialect"];
+}
+
+def GpuToAMDGPUOffload: Pass<"gpu-to-amdgpu",
+                             "::mlir::gpu::GPUModuleOp"> {
+  let summary = "Lowers a `gpu.module` to a binary annotation with ROCDL code.";
+  let description = [{
+    Lowers a `gpu.module` with ROCDL LLVM IR to a binary annotation compatible
+    with clang's new offload driver, and with the tool: `clang-offload-packager`.
+ For more information: + https://clang.llvm.org/docs/OffloadingDesign.html + https://clang.llvm.org/docs/ClangOffloadPackager.html + }]; + let options = [ + Option<"triple", "triple", "std::string", + /*default=*/ "\"amdgcn-amd-amdhsa\"", + "Target triple.">, + Option<"chip", "chip", "std::string", + /*default=*/"\"generic\"", + "Target chip.">, + Option<"features", "features", "std::string", + /*default=*/"\"\"", + "Target features.">, + Option<"rocmPath", "rocm-path", "std::string", + /*default=*/"gpu::getDefaultRocmPath().str()", + "ROCm path.">, + Option<"optLevel", "O", "unsigned", + /*default=*/"0", + "Optimization level.">, + Option<"wave64", "wave64", "bool", + /*default=*/"true", + "Use Wave64 mode.">, + Option<"daz", "daz", "bool", + /*default=*/"false", + "Enable denormals are zero opt.">, + Option<"finiteOnly", "finite-only", "bool", + /*default=*/"false", + "Enable finite only opt.">, + Option<"unsafeMath", "unsafe-math", "bool", + /*default=*/"false", + "Enable unsafe math opt.">, + Option<"fastMath", "fast-math", "bool", + /*default=*/"false", + "Enable fast relaxed math opt.">, + Option<"correctSqrt", "correct-sqrt", "bool", + /*default=*/"true", + "Enable correct rounded sqrt.">, + Option<"abiVer", "abi-ver", "std::string", + /*default=*/"\"400\"", + "ABI version.">, + ListOption<"bcPaths", "libs", "std::string", + "Extra bitcode libraries paths.">, + ]; + let dependentDialects = ["mlir::gpu::GPUDialect"]; +} + #endif // MLIR_DIALECT_GPU_PASSES + diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -41,6 +41,8 @@ namespace mlir { #define GEN_PASS_DEF_GPUTOLLVMCONVERSIONPASS #include "mlir/Conversion/Passes.h.inc" +#define GEN_PASS_DEF_GPUTOOFFLOADCONVERSIONPASS +#include "mlir/Conversion/Passes.h.inc" } // namespace mlir using namespace mlir; @@ -58,6 +60,62 @@ void runOnOperation() override; }; +class GpuToOffloadConversionPass + : public impl::GpuToOffloadConversionPassBase { +public: + using Base::Base; + + void runOnOperation() final; +}; + +// Class for creating and managing offload entries & objects. +// For more information on these entries: +// https://clang.llvm.org/docs/OffloadingDesign.html#generating-offloading-entries +// https://clang.llvm.org/docs/ClangOffloadPackager.html +// https://clang.llvm.org/docs/ClangLinkerWrapper.html +struct GPUOffloadBuilder { + // Offload string constants. + static constexpr char kOffloadStructTypeName[] = "struct.__tgt_offload_entry"; + static constexpr char kOffloadEntryName[] = ".omp_offloading.entry_name."; + static constexpr char kOffloadEntry[] = ".omp_offloading.entry."; + + static constexpr char kOffloadObjectGlobalId[] = "llvm.embedded.object"; + static constexpr char kCompilerUsedGlobalId[] = "llvm.compiler.used"; + + static constexpr char kLLVMOffloadingSection[] = ".llvm.offloading"; + static constexpr char kLLVMMetadataSection[] = "llvm.metadata"; + static constexpr char kCudaOffloadingSection[] = "cuda_offloading_entries"; + static constexpr char kHipOffloadingSection[] = "hip_offloading_entries"; + + // Get the offloading entry type, described in: + static LLVM::LLVMStructType getOffloadEntryType(MLIRContext *context); + + // Create an unique identifier for the kernel combining the module and kernel + // name: __M_K. 
+  static std::string getUniqueIdentifier(gpu::LaunchFuncOp kernelLaunch);
+
+  // Create an identifier for the kernel stub: <unique identifier>_stub.
+  static std::string getStubIdentifier(gpu::LaunchFuncOp kernelLaunch);
+
+  // Create a kernel host stub, needed for function registration by the
+  // runtimes.
+  static std::string createKernelStub(gpu::LaunchFuncOp kernelLaunch,
+                                      OpBuilder &builder);
+
+  // Inserts an offloading entry into the top module.
+  static LogicalResult insertOffloadEntry(gpu::LaunchFuncOp kernelLaunch,
+                                          OpBuilder &builder);
+
+  // Calls all the required methods to obtain a valid offloading launch address.
+  static Value getOrCreateLaunchAddress(gpu::LaunchFuncOp kernelLaunch,
+                                        OpBuilder &builder);
+
+  // Concatenates all offloading objects into a single object and adds it at
+  // the start of the top module.
+  static LogicalResult createOffloadObject(ModuleOp module,
+                                           LLVMTypeConverter &converter);
+};
+
 struct FunctionCallBuilder {
   FunctionCallBuilder(StringRef functionName, Type returnType,
                       ArrayRef<Type> argumentTypes)
@@ -308,10 +366,12 @@
 public:
   ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                              StringRef gpuBinaryAnnotation,
-                                             bool kernelBarePtrCallConv)
+                                             bool kernelBarePtrCallConv,
+                                             bool llvmOffload = false)
       : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
         gpuBinaryAnnotation(gpuBinaryAnnotation),
-        kernelBarePtrCallConv(kernelBarePtrCallConv) {}
+        kernelBarePtrCallConv(kernelBarePtrCallConv), llvmOffload(llvmOffload) {
+  }
 
 private:
   Value generateParamsArray(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
@@ -325,6 +385,7 @@
   llvm::SmallString<32> gpuBinaryAnnotation;
   bool kernelBarePtrCallConv;
+  bool llvmOffload;
 };
 
 class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
@@ -408,6 +469,315 @@
     signalPassFailure();
 }
 
+void GpuToOffloadConversionPass::runOnOperation() {
+  LowerToLLVMOptions options(&getContext());
+  options.useOpaquePointers = useOpaquePointers;
+
+  LLVMTypeConverter converter(&getContext(), options);
+
+  // Create the offload object.
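+  // The object is a single `llvm.embedded.object` global in the
+  // `.llvm.offloading` section that concatenates the offload binaries of all
+  // gpu.modules (see GPUOffloadBuilder::createOffloadObject below).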
+ if (failed(GPUOffloadBuilder::createOffloadObject(getOperation(), converter))) + signalPassFailure(); + + RewritePatternSet patterns(&getContext()); + LLVMConversionTarget target(getContext()); + + target.addIllegalDialect(); + + mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns); + mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns); + populateVectorToLLVMConversionPatterns(converter, patterns); + populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns); + populateFuncToLLVMConversionPatterns(converter, patterns); + populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, + target); + populateGpuToLLVMOffloadConversionPatterns(converter, patterns, + kernelBarePtrCallConv); + + if (failed( + applyPartialConversion(getOperation(), target, std::move(patterns)))) + signalPassFailure(); +} + +LogicalResult makeLLVMOffloadObject(ModuleOp module, + LLVMTypeConverter &converter) { + return GPUOffloadBuilder::createOffloadObject(module, converter); +} + +LLVM::LLVMStructType +GPUOffloadBuilder::getOffloadEntryType(MLIRContext *context) { + auto type = + LLVM::LLVMStructType::getIdentified(context, kOffloadStructTypeName); + if (!type.isInitialized()) { + // Create the offload struct entry according to: + // https://clang.llvm.org/docs/OffloadingDesign.html#generating-offloading-entries + auto ptrType = LLVM::LLVMPointerType::get(context); + auto i32Type = IntegerType::get(context, 32); + auto i64Type = IntegerType::get(context, 64); + auto result = + type.setBody({ptrType, ptrType, i64Type, i32Type, i32Type}, false); + if (!result.succeeded()) + return nullptr; + } + return type; +} + +std::string +GPUOffloadBuilder::getUniqueIdentifier(gpu::LaunchFuncOp kernelLaunch) { + return "_M" + kernelLaunch.getKernelModuleName().str() + "_K" + + kernelLaunch.getKernelName().str(); +} + +std::string +GPUOffloadBuilder::getStubIdentifier(gpu::LaunchFuncOp kernelLaunch) { + return getUniqueIdentifier(kernelLaunch) + "_stub"; +} + +std::string GPUOffloadBuilder::createKernelStub(gpu::LaunchFuncOp kernelLaunch, + OpBuilder &builder) { + // Create the stub name. + auto name = getStubIdentifier(kernelLaunch); + + // Get the top module. + auto module = kernelLaunch->getParentOfType(); + { + // Avoid inserting the stub more than once. + auto op = module.lookupSymbol(name); + if (op) + return name; + } + auto insertionGuard = ConversionPatternRewriter::InsertionGuard(builder); + builder.setInsertionPointToStart(&module.getRegion().front()); + + // Create a simple function stub `void()`, what's important is the address of + // the function for doing the kernel registration. + auto voidType = LLVM::LLVMVoidType::get(builder.getContext()); + auto func = builder.create( + kernelLaunch.getLoc(), name, LLVM::LLVMFunctionType::get(voidType, {}), + LLVM::Linkage::External, true); + auto block = func.addEntryBlock(); + builder.setInsertionPointToStart(block); + builder.create(kernelLaunch.getLoc(), ValueRange()); + return name; +} + +LogicalResult +GPUOffloadBuilder::insertOffloadEntry(gpu::LaunchFuncOp kernelLaunch, + OpBuilder &builder) { + + using namespace LLVM; + auto module = kernelLaunch->getParentOfType(); + if (!module) { + emitError(kernelLaunch.getLoc(), "operation is not inside of a ModuleOp."); + return failure(); + } + // Create the identifiers for the entries. 
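+  // For a launch of kernel `bar` from gpu.module `foo` (a hypothetical
+  // example with the cuda offload kind), these become
+  // `.omp_offloading.entry_name._Mfoo_Kbar` and
+  // `.omp_offloading.entry._Mfoo_Kbar`, and the globals built below are
+  // roughly:
+  //   @.omp_offloading.entry_name._Mfoo_Kbar =
+  //       internal unnamed_addr constant [4 x i8] c"bar\00"
+  //   @.omp_offloading.entry._Mfoo_Kbar = weak constant
+  //       %struct.__tgt_offload_entry { ptr @_Mfoo_Kbar_stub,
+  //       ptr @.omp_offloading.entry_name._Mfoo_Kbar, i64 0, i32 0, i32 0 },
+  //       section "cuda_offloading_entries", align 1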
+  auto name = kernelLaunch.getKernelName();
+  auto kernelUid = getUniqueIdentifier(kernelLaunch);
+  std::string entryNameId = kOffloadEntryName + kernelUid;
+  std::string entryId = kOffloadEntry + kernelUid;
+  {
+    // Avoid inserting the entry more than once.
+    auto entryName = module.lookupSymbol(entryNameId);
+    auto entry = module.lookupSymbol(entryId);
+    // Entries are already there.
+    if (entryName && entry)
+      return success();
+    // One of the entries is missing.
+    if ((entry && !entryName) || (!entry && entryName)) {
+      emitError(kernelLaunch.getLoc(),
+                "one of the offloading entries is missing.");
+      return failure();
+    }
+  }
+
+  // For information on these entries see:
+  // https://clang.llvm.org/docs/OffloadingDesign.html#generating-offloading-entries
+  // Define and create useful variables.
+  auto context = builder.getContext();
+  auto loc = module.getLoc();
+  auto ptrType = LLVM::LLVMPointerType::get(context);
+  auto i32Type = builder.getI32Type();
+  auto i64Type = builder.getI64Type();
+
+  // Set the insertion point to the start of the module.
+  auto insertionGuard = ConversionPatternRewriter::InsertionGuard(builder);
+  builder.setInsertionPointToStart(&module.getRegion().front());
+
+  // Obtain the kernel name including the null terminator.
+  auto kernelName =
+      builder.getStringAttr(StringRef(name.data(), name.size() + 1));
+
+  // Create the kernel name offloading entry.
+  auto stringCnstType =
+      LLVMArrayType::get(IntegerType::get(context, 8), kernelName.size());
+  GlobalOp entryName = builder.create<GlobalOp>(
+      loc, /* Type */ stringCnstType, /* Constant */ true,
+      /* Name */ entryNameId,
+      /* Linkage */ Linkage::Internal, /* DSO local */ false,
+      /* Thread local */ false, /* Value */ kernelName,
+      /* Alignment */ nullptr, /* Address space */ 0,
+      /* Unnamed address */
+      UnnamedAddrAttr::get(context, UnnamedAddr::Global),
+      /* Section */ nullptr);
+
+  // Determine the offloading section kind.
+  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
+      kernelLaunch, kernelLaunch.getKernelModuleName());
+  if (!kernelModule) {
+    emitError(kernelLaunch.getLoc(), "expected a kernel module");
+    return failure();
+  }
+  StringRef offloadSection;
+  if (auto attr = kernelModule->getAttr(gpu::getGpuOffloadKindAnnotation()))
+    if (auto strAttr = dyn_cast<StringAttr>(attr)) {
+      gpu::OffloadKind offloadKind = gpu::getOffloadKind(strAttr.getValue());
+      if (offloadKind == gpu::OffloadKind::cuda)
+        offloadSection = kCudaOffloadingSection;
+      else if (offloadKind == gpu::OffloadKind::hip)
+        offloadSection = kHipOffloadingSection;
+    }
+  if (offloadSection.empty()) {
+    emitError(kernelModule->getLoc(),
+              "the module doesn't contain a valid offloading kind.");
+    return failure();
+  }
+
+  // Create the offloading entry.
+  auto offloadEntryType = getOffloadEntryType(context);
+  GlobalOp offloadingEntry = builder.create<GlobalOp>(
+      loc, /* Type */ offloadEntryType, /* Constant */ true,
+      /* Name */ entryId,
+      /* Linkage */ Linkage::Weak, /* DSO local */ false,
+      /* Thread local */ false, /* Value */ nullptr,
+      /* Alignment */ builder.getIntegerAttr(i64Type, 1),
+      /* Address space */ 0,
+      /* Unnamed address */ nullptr,
+      /* Section */ builder.getStringAttr(offloadSection));
+
+  // Add an initializer to the global.
+  auto block = builder.createBlock(&offloadingEntry.getRegion());
+  builder.setInsertionPointToStart(block);
+
+  // Create an undef struct entry.
+  Value entryInit = builder.create<UndefOp>(loc, offloadEntryType).getRes();
+
+  // Insert the stub address into the offloading entry.
+ auto stubAddress = builder.create( + loc, ptrType, getStubIdentifier(kernelLaunch)); + entryInit = builder.create( + entryInit.getLoc(), entryInit, stubAddress, llvm::ArrayRef{0}); + + // Insert the kernel name offloading entry to the offloading entry. + auto entryNameAddress = + builder.create(loc, ptrType, entryName.getSymName()); + entryInit = builder.create(entryInit.getLoc(), entryInit, + entryNameAddress, + llvm::ArrayRef{1}); + + // Set struct fields according to: + // https://clang.llvm.org/docs/OffloadingDesign.html#generating-offloading-entries + auto c0I64 = + builder.create(loc, builder.getIntegerAttr(i64Type, 0)); + entryInit = builder.create(entryInit.getLoc(), entryInit, + c0I64, llvm::ArrayRef{2}); + auto c0I32 = + builder.create(loc, builder.getIntegerAttr(i32Type, 0)); + entryInit = builder.create(entryInit.getLoc(), entryInit, + c0I32, llvm::ArrayRef{3}); + entryInit = builder.create(entryInit.getLoc(), entryInit, + c0I32, llvm::ArrayRef{4}); + builder.create(loc, entryInit); + return success(); +} + +Value GPUOffloadBuilder::getOrCreateLaunchAddress( + gpu::LaunchFuncOp kernelLaunch, OpBuilder &builder) { + auto ptrType = LLVM::LLVMPointerType::get(builder.getContext()); + auto stubId = createKernelStub(kernelLaunch, builder); + if (failed(insertOffloadEntry(kernelLaunch, builder))) + return {}; + return builder.create(kernelLaunch.getLoc(), ptrType, + stubId); +} + +LogicalResult +GPUOffloadBuilder::createOffloadObject(ModuleOp op, + LLVMTypeConverter &converter) { + using namespace LLVM; + OpBuilder builder(op.getContext()); + auto nativeIntType = builder.getIntegerType(converter.getPointerBitwidth()); + + // Collect all GPUModules. + SmallVector modules; + op.walk([&modules](gpu::GPUModuleOp op) { modules.push_back(op); }); + + // If there's no work finish without creating the object. + if (modules.empty()) + return success(); + + // Concatenate all offloading entries. + SmallVector binaryData; + llvm::raw_svector_ostream outputStream(binaryData); + for (auto module : modules) { + bool hasAnnotation = false; + if (auto attr = module->getAttr(gpu::getGpuOffloadObjectAnnotation())) { + if (auto bytecode = dyn_cast(attr)) { + outputStream << bytecode.getValue(); + hasAnnotation = true; + } + } + if (!hasAnnotation) { + module.emitError() << "the gpu.module doesn't contain an offload object."; + return failure(); + } + } + + // Set the insertion point to the start of the module. + builder.setInsertionPointToStart(op.getBody()); + auto stringCnstType = LLVMArrayType::get(IntegerType::get(op.getContext(), 8), + binaryData.size()); + + // Create the offload section with all the binary annotations. TODO add + // !exclude metadata to this variable see: + // https://llvm.org/docs/LangRef.html#exclude-metadata + auto offloadObject = builder.create( + op.getLoc(), /* Type */ stringCnstType, /* Constant */ true, + /* Name */ kOffloadObjectGlobalId, + /* Linkage */ Linkage::Private, /* DSO local */ false, + /* Thread local */ false, /* Value */ builder.getStringAttr(binaryData), + /* Alignment */ builder.getIntegerAttr(nativeIntType, 8), + /* Address space */ 0, + /* Unnamed address */ nullptr, + /* Section */ builder.getStringAttr(kLLVMOffloadingSection)); + + // This second global is to prevent the offloadObject from being optimized + // away. 
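+  // In LLVM IR terms, the embedded object and its guard are roughly
+  // (illustrative):
+  //   @llvm.embedded.object = private constant [N x i8] c"...",
+  //       section ".llvm.offloading", align 8
+  //   @llvm.compiler.used = appending global [1 x ptr]
+  //       [ptr @llvm.embedded.object], section "llvm.metadata"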
+ auto ptrType = LLVM::LLVMPointerType::get(op.getContext()); + auto ptrArrayType = LLVMArrayType::get(ptrType, 1); + auto llvmMetadata = builder.create( + op.getLoc(), /* Type */ ptrArrayType, /* Constant */ false, + /* Name */ kCompilerUsedGlobalId, + /* Linkage */ Linkage::Appending, /* DSO local */ false, + /* Thread local */ false, /* Value */ nullptr, + /* Alignment */ nullptr, + /* Address space */ 0, + /* Unnamed address */ nullptr, + /* Section */ builder.getStringAttr(kLLVMMetadataSection)); + + // Insert the offloadObject to the llvmMetadata section to prevent being + // optimized away. + auto block = builder.createBlock(&llvmMetadata.getRegion()); + builder.setInsertionPointToStart(block); + Value entryInit = builder.create(op.getLoc(), ptrArrayType).getRes(); + auto stubAddress = builder.create(op.getLoc(), ptrType, + offloadObject.getName()); + entryInit = builder.create( + entryInit.getLoc(), entryInit, stubAddress, llvm::ArrayRef{0}); + builder.create(op.getLoc(), entryInit); + return success(); +} + LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder, ArrayRef arguments) const { auto module = builder.getBlock()->getParent()->getParentOfType(); @@ -759,9 +1129,10 @@ LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); } -// Emits LLVM IR to launch a kernel function. Expects the module that contains -// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a -// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR. +// If llvmOffload is set to false in the pattern emits LLVM IR to launch a +// kernel function. Expects the module that contains the compiled kernel +// function as a cubin in the 'nvvm.cubin' attribute, or a hsaco in the +// 'rocdl.hsaco' attribute of the kernel function in the IR. // // %0 = call %binarygetter // %1 = call %moduleLoad(%0) @@ -774,8 +1145,20 @@ // call %streamDestroy(%4) // call %moduleUnload(%1) // +// When llvmOffload is set to true, this pattern emits LLVM IR to launch a +// kernel function and expects LLVM offload annotations in the gpu.modules. In +// this case the generated code is: +// +// %0 = call %streamCreate() +// %1 = +// %2 = +// call %launchKernel(%2, , 0, %0, %1, nullptr) +// call %streamSynchronize(%0) +// call %streamDestroy(%0) +// // If the op is async, the stream corresponds to the (single) async dependency // as well as the async token the op produces. +// LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -795,34 +1178,45 @@ Location loc = launchOp.getLoc(); - // Create an LLVM global with CUBIN extracted from the kernel annotation and - // obtain a pointer to the first byte in it. - auto kernelModule = SymbolTable::lookupNearestSymbolFrom( - launchOp, launchOp.getKernelModuleName()); - assert(kernelModule && "expected a kernel module"); - - auto binaryAttr = - kernelModule->getAttrOfType(gpuBinaryAnnotation); - if (!binaryAttr) { - kernelModule.emitOpError() - << "missing " << gpuBinaryAnnotation << " attribute"; - return failure(); + Value kernelAddress, moduleAddress; + // If llvmOffload is set to false, generate the traditional offload code. + if (!llvmOffload) { + // Create an LLVM global with CUBIN extracted from the kernel annotation and + // obtain a pointer to the first byte in it. 
+ auto kernelModule = SymbolTable::lookupNearestSymbolFrom( + launchOp, launchOp.getKernelModuleName()); + assert(kernelModule && "expected a kernel module"); + + auto binaryAttr = + kernelModule->getAttrOfType(gpuBinaryAnnotation); + if (!binaryAttr) { + kernelModule.emitOpError() + << "missing " << gpuBinaryAnnotation << " attribute"; + return failure(); + } + SmallString<128> nameBuffer(kernelModule.getName()); + nameBuffer.append(kGpuBinaryStorageSuffix); + Value data = LLVM::createGlobalString( + loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), + LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); + + moduleAddress = + moduleLoadCallBuilder.create(loc, rewriter, data).getResult(); + // Get the function from the module. The name corresponds to the name of + // the kernel function. + auto kernelName = generateKernelNameConstant( + launchOp.getKernelModuleName().getValue(), + launchOp.getKernelName().getValue(), loc, rewriter); + kernelAddress = moduleGetFunctionCallBuilder + .create(loc, rewriter, {moduleAddress, kernelName}) + .getResult(); + } else { + kernelAddress = + GPUOffloadBuilder::getOrCreateLaunchAddress(launchOp, rewriter); + if (!kernelAddress) + return failure(); } - SmallString<128> nameBuffer(kernelModule.getName()); - nameBuffer.append(kGpuBinaryStorageSuffix); - Value data = LLVM::createGlobalString( - loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), - LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); - - auto module = moduleLoadCallBuilder.create(loc, rewriter, data); - // Get the function from the module. The name corresponds to the name of - // the kernel function. - auto kernelName = generateKernelNameConstant( - launchOp.getKernelModuleName().getValue(), - launchOp.getKernelName().getValue(), loc, rewriter); - auto function = moduleGetFunctionCallBuilder.create( - loc, rewriter, {module.getResult(), kernelName}); Value zero = rewriter.create(loc, llvmInt32Type, 0); Value stream = adaptor.getAsyncDependencies().empty() @@ -836,7 +1230,7 @@ : zero; launchKernelCallBuilder.create( loc, rewriter, - {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(), + {kernelAddress, adaptor.getGridSizeX(), adaptor.getGridSizeY(), adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams, /*extra=*/nullpointer}); @@ -852,7 +1246,8 @@ streamDestroyCallBuilder.create(loc, rewriter, stream); rewriter.eraseOp(launchOp); } - moduleUnloadCallBuilder.create(loc, rewriter, module.getResult()); + if (!llvmOffload) + moduleUnloadCallBuilder.create(loc, rewriter, moduleAddress); return success(); } @@ -978,6 +1373,30 @@ ConvertWaitOpToGpuRuntimeCallPattern, ConvertAsyncYieldToGpuRuntimeCallPattern>(converter); patterns.add( - converter, gpuBinaryAnnotation, kernelBarePtrCallConv); + converter, gpuBinaryAnnotation, kernelBarePtrCallConv, + /* disable llvmOffload */ false); + patterns.add(&converter.getContext()); +} + +void mlir::populateGpuToLLVMOffloadConversionPatterns( + LLVMTypeConverter &converter, RewritePatternSet &patterns, + bool kernelBarePtrCallConv) { + converter.addConversion([&converter](gpu::AsyncTokenType type) -> Type { + return converter.getPointerType( + IntegerType::get(&converter.getContext(), 8)); + }); + patterns.add(converter); + patterns.add( + converter, gpu::getDefaultGpuBinaryAnnotation(), kernelBarePtrCallConv, + /* enable llvmOffload */ true); patterns.add(&converter.getContext()); } diff --git 
a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -47,7 +47,9 @@ Transforms/AllReduceLowering.cpp Transforms/AsyncRegionRewriter.cpp Transforms/KernelOutlining.cpp + Transforms/GpuToDeviceOffload.cpp Transforms/MemoryPromotion.cpp + Transforms/NameMangling.cpp Transforms/ParallelLoopMapper.cpp Transforms/SerializeToBlob.cpp Transforms/SerializeToCubin.cpp @@ -88,6 +90,28 @@ add_subdirectory(TransformOps) +if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + if (NOT DEFINED CUDAToolkit_ROOT) + find_package(CUDAToolkit) + get_filename_component(CUDAToolkit_ROOT ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + endif() + + # Enable gpu-to-nvptx pass. + target_compile_definitions(obj.MLIRGPUTransforms + PRIVATE + MLIR_GPU_TO_NVPTX_PASS_ENABLE=1 + # This variable should be set to CUDAToolkit_LIBRARY_ROOT, however the + # variable is unset for recent cuda toolkits, see: + # https://gitlab.kitware.com/cmake/cmake/-/issues/24858 + __DEFAULT_CUDATOOLKIT_PATH__="${CUDAToolkit_ROOT}" + ) + +target_link_libraries(MLIRGPUTransforms + PRIVATE + MLIRNVVMToLLVMIRTranslation +) +endif() + if(MLIR_ENABLE_CUDA_RUNNER) if(NOT MLIR_ENABLE_CUDA_CONVERSIONS) message(SEND_ERROR @@ -127,16 +151,29 @@ endif() +if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + # Enable gpu-to-amdgpu pass. + set(ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs") + target_compile_definitions(obj.MLIRGPUTransforms + PRIVATE + MLIR_GPU_TO_AMDGPU_PASS_ENABLE=1 + __DEFAULT_ROCM_PATH__="${ROCM_PATH}" + ) + + target_link_libraries(MLIRGPUTransforms + PRIVATE + MLIRROCDLToLLVMIRTranslation + ) +endif() + if(MLIR_ENABLE_ROCM_CONVERSIONS) if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)) message(SEND_ERROR "Building mlir with ROCm support requires the AMDGPU backend") endif() - set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs") target_compile_definitions(obj.MLIRGPUTransforms PRIVATE - __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}" MLIR_GPU_TO_HSACO_PASS_ENABLE=1 ) diff --git a/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceObjectCommon.h b/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceObjectCommon.h new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceObjectCommon.h @@ -0,0 +1,516 @@ +//===- GpuToDeviceObjectCommon.h - GPU to Device object utilities ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a series of utilities for transforming GPUModuleOps into +// binary annotations. 
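+// A gpu.module processed by these utilities carries the offload binary in a
+// `llvm.offload.object` string attribute and its kind ("cuda" or "hip") in a
+// `llvm.offload.kind` attribute.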
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/TargetParser.h" +#include "llvm/Transforms/IPO/Internalize.h" + +#include "mlir/Dialect/GPU/Transforms/Passes.h" + +namespace mlir { +namespace gpu { +// Mixin class for all GpuToDevice* passes. +// This class needs to be listed as a friend class, as it will access protected +// members. +template +class GpuToDeviceOffloadMixin { +private: + Derived &getDerived() { return static_cast(*this); } + +protected: + // Function interfaces to be implemented by the final class. + // Return the optimization level, -1 signifies don't run the optimization + // pipeline. + int getOptLevel() { return -1; } + + // Hook for loading bitcode files, returns std::nullopt on failure. + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) { + return SmallVector>{}; + } + + // Hook for performing additional actions on a loaded bitcode file. + void handleBitcodeFile(llvm::Module &module) {} + + // Hook for performing additional actions on the llvmModule pre linking. + void handleModulePreLink(llvm::Module &module) {} + + // Hook for performing additional actions on the llvmModule post linking. + void handleModulePostLink(llvm::Module &module) {} + +protected: + // Create the target machine based on the target triple and chip. + std::unique_ptr createTargetMachine(); + + // Loads a bitcode file from path. + std::unique_ptr loadBitcodeFile(llvm::LLVMContext &context, + StringRef path); + + // Loads multiple bitcode files. + LogicalResult + loadBitcodeFileList(llvm::LLVMContext &context, + ArrayRef fileList, + SmallVector> &llvmModules, + bool failureOnError = true); + + // Translates the gpu.module to LLVM IR. + std::unique_ptr + translateToLLVMIR(llvm::LLVMContext &llvmContext); + + // Link the llvmModule to other bitcode file. + LogicalResult linkFiles(llvm::Module &module, + SmallVector> &&libs); + + // Optimize the module. + LogicalResult optimizeModule(llvm::Module &module, + llvm::TargetMachine &targetMachine, + int optLevel = 3); + + // Serializes the LLVM IR bitcode to a special object file described in: + // https://clang.llvm.org/docs/ClangOffloadPackager.html + SmallVector serializeModuleToObject(llvm::Module &llvmModule, + gpu::OffloadKind offloadKind); + + // Insert the binary annotation to the GPUModule. + void insertAnnotations(gpu::GPUModuleOp module, + SmallVector &binaryObject, + gpu::OffloadKind offloadKind); + + // Run the pass. 
+ void run(gpu::OffloadKind offloadKind); +}; + +template +std::unique_ptr +GpuToDeviceOffloadMixin::createTargetMachine() { + auto &self = getDerived(); + Location loc = self.getOperation().getLoc(); + std::string error; + + // Load the target. + const llvm::Target *target = + llvm::TargetRegistry::lookupTarget(self.triple, error); + if (!target) { + emitError(loc, Twine("failed to lookup target: ") + error); + return {}; + } + + // Create the target machine using the target. + llvm::TargetMachine *machine = target->createTargetMachine( + self.triple, self.chip, self.features, {}, {}); + if (!machine) { + emitError(loc, "failed to create target machine"); + return {}; + } + return std::unique_ptr{machine}; +} + +template +std::unique_ptr +GpuToDeviceOffloadMixin::loadBitcodeFile(llvm::LLVMContext &context, + StringRef path) { + auto &self = getDerived(); + llvm::SMDiagnostic error; + std::unique_ptr library = + llvm::getLazyIRFileModule(path, error, context); + if (!library) { + self.getOperation().emitError() << "Failed to load file from " << path + << ", error: " << error.getMessage(); + return nullptr; + } + return library; +} + +template +LogicalResult GpuToDeviceOffloadMixin::loadBitcodeFileList( + llvm::LLVMContext &context, ArrayRef fileList, + SmallVector> &llvmModules, + bool failureOnError) { + auto &self = getDerived(); + for (const std::string &str : fileList) { + // Test if the path exists, if it doesn't abort. + StringRef pathRef = StringRef(str.data(), str.size()); + if (!llvm::sys::fs::is_regular_file(pathRef)) { + self.getOperation().emitError() + << "File path: " << pathRef << " does not exist or is not a file.\n"; + return failure(); + } + // Load the file or abort on error. + if (auto bcFile = loadBitcodeFile(context, pathRef)) + llvmModules.push_back(std::move(bcFile)); + else if (failureOnError) + return failure(); + } + return success(); +} + +template +std::unique_ptr +GpuToDeviceOffloadMixin::translateToLLVMIR( + llvm::LLVMContext &llvmContext) { + return translateModuleToLLVMIR(getDerived().getOperation(), llvmContext, + "LLVMDialectModule"); +} + +template +LogicalResult GpuToDeviceOffloadMixin::linkFiles( + llvm::Module &module, SmallVector> &&libs) { + auto &self = getDerived(); + if (libs.empty()) + return success(); + llvm::Linker linker(module); + for (std::unique_ptr &libModule : libs) { + // This bitcode linking code is substantially similar to what is used in + // hip-clang It imports the library functions into the module, allowing LLVM + // optimization passes (which must run after linking) to optimize across the + // libraries and the module's code. We also only import symbols if they are + // referenced by the module or a previous library since there will be no + // other source of references to those symbols in this compilation and since + // we don't want to bloat the resulting code object. 
+ bool err = linker.linkInModule( + std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded, + [](llvm::Module &m, const StringSet<> &gvs) { + llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) { + return !gv.hasName() || (gvs.count(gv.getName()) == 0); + }); + }); + // True is linker failure + if (err) { + self.getOperation().emitError( + "unrecoverable failure during device library linking."); + // We have no guaranties about the state of `ret`, so bail + return failure(); + } + } + return success(); +} + +template +LogicalResult GpuToDeviceOffloadMixin::optimizeModule( + llvm::Module &module, llvm::TargetMachine &targetMachine, int optLevel) { + auto &self = getDerived(); + if (optLevel < 0 || optLevel > 3) + return self.getOperation().emitError() + << "Invalid optimization level" << optLevel << "\n"; + + targetMachine.setOptLevel(static_cast(optLevel)); + + auto transformer = + makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine); + auto error = transformer(&module); + if (error) { + InFlightDiagnostic mlirError = self.getOperation()->emitError(); + llvm::handleAllErrors( + std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) { + mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n"; + }); + return mlirError; + } + return success(); +} + +template +SmallVector GpuToDeviceOffloadMixin::serializeModuleToObject( + llvm::Module &llvmModule, gpu::OffloadKind offloadKind) { + using namespace llvm; + using namespace llvm::object; + auto &self = getDerived(); + + // Set the offload kind. + llvm::object::OffloadKind offKind = OFK_None; + if (offloadKind == gpu::OffloadKind::cuda) + offKind = OFK_Cuda; + else if (offloadKind == gpu::OffloadKind::hip) + offKind = OFK_HIP; + + SmallVector offloadData; + { + // Create the offload object, for more information check: + // https://clang.llvm.org/docs/ClangOffloadPackager.html + OffloadBinary::OffloadingImage imageBinary{}; + std::unique_ptr buffer; + // Add a scope to trash the binaryObject buffer as soon it's done being + // used. + { + SmallVector binaryData; + // Write the LLVM module bitcode to a buffer. + raw_svector_ostream outputStream(binaryData); + WriteBitcodeToFile(llvmModule, outputStream); + imageBinary.TheImageKind = IMG_Bitcode; + imageBinary.TheOffloadKind = offKind; + imageBinary.StringData["triple"] = self.triple; + // Avoid setting the arch if no arch was given in the cmd, as clang will + // compile code only for this arch if set, so running the code on an + // incompatible arch will result in error. + if (!self.chip.isDefaultOption()) + imageBinary.StringData["arch"] = self.chip; + imageBinary.Image = MemoryBuffer::getMemBuffer( + StringRef(binaryData.data(), binaryData.size()), "", false); + buffer = OffloadBinary::write(imageBinary); + } + // Check that the image was properly created. This step was taken from: + // https://github.com/llvm/llvm-project/blob/main/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp + if (buffer->getBufferSize() % OffloadBinary::getAlignment() != 0) { + emitError(self.getOperation().getLoc(), + "Offload binary has an invalid size alignment"); + return {}; + } + // Write the buffer. 
+ raw_svector_ostream outputStream(offloadData); + outputStream << buffer->getBuffer(); + } + return offloadData; +} + +template +void GpuToDeviceOffloadMixin::insertAnnotations( + gpu::GPUModuleOp module, SmallVector &binaryObject, + gpu::OffloadKind offloadKind) { + auto &self = getDerived(); + // Create a pair of annotations to store the object and the offload kind. + module->setAttr( + gpu::getGpuOffloadObjectAnnotation(), + StringAttr::get(&self.getContext(), + StringRef(binaryObject.data(), binaryObject.size()))); + module->setAttr( + gpu::getGpuOffloadKindAnnotation(), + StringAttr::get(&self.getContext(), gpu::fromOffloadKind(offloadKind))); +} + +template +void GpuToDeviceOffloadMixin::run(gpu::OffloadKind offloadKind) { + auto &self = getDerived(); + // Translate the GPUModule to LLVM IR. + llvm::LLVMContext llvmContext; + std::unique_ptr llvmModule = + self.translateToLLVMIR(llvmContext); + if (!llvmModule) + return self.signalPassFailure(); + + // Create the target machine. + std::unique_ptr targetMachine = + self.createTargetMachine(); + if (!targetMachine) + return self.signalPassFailure(); + + // Set the data layout and target triple of the module. + llvmModule->setDataLayout(targetMachine->createDataLayout()); + llvmModule->setTargetTriple(targetMachine->getTargetTriple().getTriple()); + + // Link bitcode files. + self.handleModulePreLink(*llvmModule); + { + auto libs = self.loadBitcodeFiles(llvmContext, *llvmModule); + if (!libs) + return self.signalPassFailure(); + if (libs->size()) + if (failed(self.linkFiles(*llvmModule, std::move(*libs)))) + return self.signalPassFailure(); + self.handleModulePostLink(*llvmModule); + } + + // Optimize the module. + auto optLevel = self.getOptLevel(); + if (optLevel != -1 && + failed(self.optimizeModule(*llvmModule, *targetMachine, optLevel))) + return self.signalPassFailure(); + + // Serialize the LLVM Module to an object file. + auto binaryObject = self.serializeModuleToObject(*llvmModule, offloadKind); + auto op = self.getOperation(); + if (binaryObject.empty()) { + emitError(op.getLoc(), "Failed to serialize to bitcode."); + return self.signalPassFailure(); + } + + // Insert the binary object to the module. + self.insertAnnotations(op, binaryObject, offloadKind); +} + +// NVPTX specific mixin implementation of common functions for +// `GpuToDeviceOffloadMixin`. This class needs to be listed as a friend class, +// as it will access protected members. +template +class GPUToNVPTXMixin { +private: + Derived &getDerived() { return static_cast(*this); } + +protected: + using NVPTXBase = GPUToNVPTXMixin; + + // Implementation of GpuToDeviceOffloadMixin::loadBitcodeFiles. It can be used + // in child classes by adding `using NVPTXBase::loadBitcodeFiles;`. + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) { + auto &self = getDerived(); + SmallVector> bcFiles; + + // Try to load libdevice from a cuda installation. + StringRef pathRef(self.cudaPath.getValue()); + if (pathRef.size()) { + SmallVector path; + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_directory(pathRef)) { + self.getOperation().emitError() + << "CUDA path: " << pathRef + << " does not exist or is not a directory.\n"; + return std::nullopt; + } + // TODO remove this hard coded path. 
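+      // On a default CUDA Toolkit layout this resolves to
+      // <cuda-path>/nvvm/libdevice/libdevice.10.bc.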
+ llvm::sys::path::append(path, "nvvm", "libdevice", "libdevice.10.bc"); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_regular_file(pathRef)) { + self.getOperation().emitError() + << "LibDevice path: " << pathRef + << " does not exist or is not a file.\n"; + return std::nullopt; + } + if (auto bcFile = self.loadBitcodeFile(context, pathRef)) + bcFiles.push_back(std::move(bcFile)); + } + + // Add extra libraries. + if (failed(self.loadBitcodeFileList(context, self.bcPaths, bcFiles, true))) + return std::nullopt; + return bcFiles; + } +}; + +// AMDGPU specific mixin implementation of common functions for +// `GpuToDeviceOffloadMixin`. This class needs to be listed as a friend class, +// as it will access protected members. +template +class GPUToAMDGPUMixin { +private: + Derived &getDerived() { return static_cast(*this); } + +protected: + using AMDGPUBase = GPUToAMDGPUMixin; + + // Get the paths of ROCm device libraries. Function adapted from: + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Driver/ToolChains/AMDGPU.cpp + void getCommonBitcodeLibs(llvm::SmallVector &libs, + SmallVector &libPath, + StringRef isaVersion, bool wave64, bool daz, + bool finiteOnly, bool unsafeMath, bool fastMath, + bool correctSqrt, StringRef abiVer) { + auto &self = getDerived(); + auto addLib = [&](StringRef path) { + if (!llvm::sys::fs::is_regular_file(path)) { + self.getOperation().emitRemark() + << "Bitcode library path: " << path + << " does not exist or is not a file.\n"; + return; + } + libs.push_back(path.str()); + }; + auto optLib = [](StringRef name, bool on) -> Twine { + return name + (on ? "_on" : "_off"); + }; + auto getLibPath = [&libPath](Twine lib) { + auto baseSize = libPath.size(); + llvm::sys::path::append(libPath, lib + ".bc"); + std::string path(StringRef(libPath.data(), libPath.size()).str()); + libPath.truncate(baseSize); + return path; + }; + + // Add ROCm device libraries. + addLib(getLibPath("ocml")); + addLib(getLibPath("ockl")); + addLib(getLibPath(optLib("oclc_daz_opt", daz))); + addLib(getLibPath(optLib("oclc_unsafe_math", unsafeMath || fastMath))); + addLib(getLibPath(optLib("oclc_finite_only", finiteOnly || fastMath))); + addLib(getLibPath(optLib("oclc_correctly_rounded_sqrt", correctSqrt))); + addLib(getLibPath(optLib("oclc_wavefrontsize64", wave64))); + addLib(getLibPath("oclc_isa_version_" + isaVersion)); + if (abiVer.size()) + addLib(getLibPath("oclc_abi_version_" + abiVer)); + } + + // Implementation of GpuToDeviceOffloadMixin::loadBitcodeFiles. It can be used + // in child classes by adding `using AMDGPUBase::loadBitcodeFiles;`. + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) { + SmallVector> bcFiles; + auto &self = getDerived(); + SmallVector libsPath; + + // Try to load device libraries from a ROCm installation. + StringRef pathRef(self.rocmPath.getValue()); + if (pathRef.size()) { + SmallVector path; + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + // TODO remove this hard coded ROCm path. 
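+      // On a standard ROCm install the device libraries live under
+      // <rocm-path>/amdgcn/bitcode.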
+ llvm::sys::path::append(path, "amdgcn", "bitcode"); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_directory(pathRef)) { + self.getOperation().emitRemark() + << "ROCm amdgcn bitcode path: " << pathRef + << " does not exist or is not a directory\n"; + return std::nullopt; + } + std::string isaVersion; + auto isaVer = llvm::AMDGPU::getIsaVersion(self.chip); + if (isaVer.Major != 0) + isaVersion = std::to_string(isaVer.Major) + + std::to_string(isaVer.Minor) + + llvm::utohexstr(isaVer.Stepping, /*lower case*/ true); + getCommonBitcodeLibs(libsPath, path, isaVersion, self.wave64, self.daz, + self.finiteOnly, self.unsafeMath, self.fastMath, + self.correctSqrt, self.abiVer); + } else + libsPath.reserve(libsPath.size() + self.bcPaths.size()); + + libsPath.insert(libsPath.end(), self.bcPaths.begin(), self.bcPaths.end()); + + // Add extra libraries. + if (failed(self.loadBitcodeFileList(context, libsPath, bcFiles, true))) + return std::nullopt; + return bcFiles; + } + + // Implementation of GpuToDeviceOffloadMixin::handleBitcodeFile. It can be + // used in child classes by adding `using AMDGPUBase::handleBitcodeFile;`. + void handleBitcodeFile(llvm::Module &module) { + // Some ROCM builds don't strip this like they should + if (auto *openclVersion = module.getNamedMetadata("opencl.ocl.version")) + module.eraseNamedMetadata(openclVersion); + // Stop spamming us with clang version numbers + if (auto *ident = module.getNamedMetadata("llvm.ident")) + module.eraseNamedMetadata(ident); + } +}; +} // namespace gpu +} // namespace mlir diff --git a/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceOffload.cpp b/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceOffload.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/GpuToDeviceOffload.cpp @@ -0,0 +1,195 @@ +//===- GpuToDeviceOffload.cpp - Impl. of GPU to NVPTX & AMDGPU passes -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the GpuToNVPTXOffload & GpuToAMDGPUOffload passes. +// +//===----------------------------------------------------------------------===// + +#include "GpuToDeviceObjectCommon.h" + +using namespace mlir; + +namespace mlir { +namespace gpu { +StringRef getDefaultCudaToolkitPath() { +#ifdef __DEFAULT_CUDATOOLKIT_PATH__ + return __DEFAULT_CUDATOOLKIT_PATH__; +#else + return ""; +#endif +} + +StringRef getDefaultRocmPath() { +#ifdef __DEFAULT_ROCM_PATH__ + return __DEFAULT_ROCM_PATH__; +#else + return ""; +#endif +} + +StringRef getGpuOffloadObjectAnnotation() { return "llvm.offload.object"; } + +StringRef getGpuOffloadKindAnnotation() { return "llvm.offload.kind"; } +} // namespace gpu + +#define GEN_PASS_DEF_GPUTONVPTXOFFLOAD +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +#define GEN_PASS_DEF_GPUTOAMDGPUOFFLOAD +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +} // namespace mlir + +#ifdef MLIR_GPU_TO_NVPTX_PASS_ENABLE +#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h" +namespace { +// NVPTX target initializer. +struct InitNVPTXTarget { + InitNVPTXTarget() { + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + } +}; + +// This ensures that the target is initialized once. 
+llvm::ManagedStatic nvptxTargetInit; +// CUDA specific Offload pass +class GpuToNVPTXOffload + : public impl::GpuToNVPTXOffloadBase, + public gpu::GpuToDeviceOffloadMixin, + public gpu::GPUToNVPTXMixin { +private: + template + friend class ::mlir::gpu::GpuToDeviceOffloadMixin; + template + friend class ::mlir::gpu::GPUToNVPTXMixin; + +public: + using Base::Base; + using NVPTXBase::loadBitcodeFiles; + + // Initialize the NVPTX target. + LogicalResult initialize(MLIRContext *context) override; + + // Add LLVM IR dialect translations to the registry. + void getDependentDialects(DialectRegistry ®istry) const override; + + void runOnOperation() final; + +protected: + // Return the optimization level, -1 signifies don't run the optimization + // pipeline. + int getOptLevel() { return optLevel.getValue(); } +}; +} // namespace + +LogicalResult GpuToNVPTXOffload::initialize(MLIRContext *context) { + *nvptxTargetInit; + return success(); +} + +void GpuToNVPTXOffload::getDependentDialects(DialectRegistry ®istry) const { + impl::GpuToNVPTXOffloadBase::getDependentDialects( + registry); + registerGPUDialectTranslation(registry); + registerLLVMDialectTranslation(registry); + registerNVVMDialectTranslation(registry); +} + +void GpuToNVPTXOffload::runOnOperation() { run(gpu::OffloadKind::cuda); } +#else +namespace { +class GpuToNVPTXOffload + : public impl::GpuToNVPTXOffloadBase { +public: + using impl::GpuToNVPTXOffloadBase::GpuToNVPTXOffloadBase; + + void runOnOperation() final { + getOperation().emitError() + << "This pass requires the NVPTX target but it wasn't built."; + return signalPassFailure(); + } +}; +} // namespace +#endif + +#ifdef MLIR_GPU_TO_AMDGPU_PASS_ENABLE +#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" +namespace { +// AMDGPU target initializer. +struct InitAMDGPUTarget { + InitAMDGPUTarget() { + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + } +}; +// This ensures that the target is initialized once. +llvm::ManagedStatic amdgpuTargetInit; + +// AMDGPU specific Offload pass +class GpuToAMDGPUOffload + : public impl::GpuToAMDGPUOffloadBase, + public gpu::GpuToDeviceOffloadMixin, + public gpu::GPUToAMDGPUMixin { +private: + template + friend class ::mlir::gpu::GpuToDeviceOffloadMixin; + template + friend class ::mlir::gpu::GPUToAMDGPUMixin; + +public: + using AMDGPUBase::handleBitcodeFile; + using AMDGPUBase::loadBitcodeFiles; + using Base::Base; + // Initialize the AMDGPU target. + LogicalResult initialize(MLIRContext *context) override; + + // Add LLVM IR dialect translations to the registry. + void getDependentDialects(DialectRegistry ®istry) const override; + + void runOnOperation() final; + +protected: + // Return the optimization level, -1 signifies don't run the optimization + // pipeline. 
+ int getOptLevel() { return optLevel.getValue(); } +}; +} // namespace + +LogicalResult GpuToAMDGPUOffload::initialize(MLIRContext *context) { + *amdgpuTargetInit; + return success(); +} + +void GpuToAMDGPUOffload::getDependentDialects(DialectRegistry ®istry) const { + impl::GpuToAMDGPUOffloadBase::getDependentDialects( + registry); + registerGPUDialectTranslation(registry); + registerLLVMDialectTranslation(registry); + registerROCDLDialectTranslation(registry); +} + +void GpuToAMDGPUOffload::runOnOperation() { run(gpu::OffloadKind::hip); } +#else +namespace { +class GpuToAMDGPUOffload + : public impl::GpuToAMDGPUOffloadBase { +public: + using impl::GpuToAMDGPUOffloadBase< + GpuToAMDGPUOffload>::GpuToAMDGPUOffloadBase; + + void runOnOperation() final { + getOperation().emitError() + << "This pass requires the AMDGPU target but it wasn't built."; + return signalPassFailure(); + } +}; +} // namespace +#endif diff --git a/mlir/lib/Dialect/GPU/Transforms/NameMangling.cpp b/mlir/lib/Dialect/GPU/Transforms/NameMangling.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/NameMangling.cpp @@ -0,0 +1,125 @@ +//===- NameMangling.cpp - Implementation of GPU symbols mangling ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the GPU dialect name mangling pass. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/Transforms/Passes.h" + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace mlir { +#define GEN_PASS_DEF_GPUNAMEMANGLING +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +} // namespace mlir + +namespace { +// Mangle the names of all the top symbols inside a GPUModuleOp from symbol to +// "__G_S", for all GPUModuleOps in a module. +class GpuNameMangling : public impl::GpuNameManglingBase { +public: + using Base::Base; + + // Get the mangled name for the symbol. + StringAttr getMangledName(StringAttr moduleName, StringAttr symbol); + + // Mangle all the definitions inside a particular GPUModuleOp. + LogicalResult mangleNamesInModule(gpu::GPUModuleOp module); + + // Update all the symbol uses of a particular symbol inside the top module. + // `symbolUses` is the range of symbol uses of the gpu.module name in the top + // module symbol table. + void updateSymbolUses(SymbolTable::UseRange &&symbolUses); + + void runOnOperation() final; +}; +} // namespace + +StringAttr GpuNameMangling::getMangledName(StringAttr moduleName, + StringAttr symbol) { + std::string name = "__G" + moduleName.str() + "_S" + symbol.str(); + return StringAttr::get(&getContext(), name); +} + +LogicalResult GpuNameMangling::mangleNamesInModule(gpu::GPUModuleOp gpuModule) { + SymbolTable synbolTable(gpuModule); + for (auto &op : gpuModule.getBody()->getOperations()) { + // Ignore external functions. + if (auto fn = dyn_cast(op)) + if (fn.isExternal()) + continue; + if (auto symbol = dyn_cast(op)) { + auto mangledName = + getMangledName(gpuModule.getNameAttr(), symbol.getNameAttr()); + + // Replace all the symbol uses of `symbol` to its mangled name. 
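+      // For example, a function `@bar` inside `gpu.module @foo` becomes
+      // `@__Gfoo_Sbar` here, and references like `@foo::@bar` in the top
+      // module are later rewritten to `@foo::@__Gfoo_Sbar` by
+      // updateSymbolUses.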
+ if (failed(synbolTable.replaceAllSymbolUses( + symbol.getNameAttr(), mangledName, &gpuModule.getRegion()))) { + emitError(op.getLoc(), "Failed to replace the symbol name."); + return failure(); + } + + // On symbol replacement success rename the symbol. + synbolTable.setSymbolName(symbol, mangledName); + } + } + return success(); +} + +void GpuNameMangling::updateSymbolUses(SymbolTable::UseRange &&symbolUses) { + // All symbolUses correspond to a particular gpu.module name. + for (auto symbolUse : symbolUses) { + Operation *operation = symbolUse.getUser(); + SmallVector> symbolReferences; + + // Collect all references to the `symbol` in the attributes of the + // operation. + for (auto opAttr : operation->getAttrs()) { + if (auto symbol = dyn_cast(opAttr.getValue())) + if (symbol == symbolUse.getSymbolRef()) + symbolReferences.push_back({opAttr.getName(), symbol}); + } + + // Update the symbol references. + for (auto &[attrName, symbol] : symbolReferences) { + auto nestedReferences = symbol.getNestedReferences(); + if (nestedReferences.size()) { + SmallVector updatedReferences(nestedReferences); + // Only the first nested reference was updated by the previous step, + // thus we just update that one. + updatedReferences[0] = FlatSymbolRefAttr::get(getMangledName( + symbol.getRootReference(), nestedReferences[0].getRootReference())); + operation->setAttr( + attrName, + SymbolRefAttr::get(symbol.getRootReference(), updatedReferences)); + } + } + } +} + +void GpuNameMangling::runOnOperation() { + auto module = getOperation(); + SmallVector gpuModules; + // Collect all gpu.modules. + module.walk([&gpuModules](gpu::GPUModuleOp op) { gpuModules.push_back(op); }); + SymbolTable moduleTable(module); + + // Mangle the names. + for (auto gpuModule : gpuModules) { + if (failed(mangleNamesInModule(gpuModule))) + return signalPassFailure(); + if (auto symbolUses = moduleTable.getSymbolUses(gpuModule.getNameAttr(), + &module.getRegion())) + updateSymbolUses(std::move(*symbolUses)); + } +} diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -207,6 +207,31 @@ ) endif() + if(MLIR_ENABLE_CUDART_RUNNER) + find_package(CUDAToolkit REQUIRED) + + add_mlir_library(mlir_cudart_runtime + SHARED + CudaRuntimeWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) + set_property(TARGET mlir_cudart_runtime PROPERTY CXX_STANDARD 14) + target_include_directories(mlir_cudart_runtime + PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ) + target_link_libraries(mlir_cudart_runtime + PRIVATE + CUDA::cudart + CUDA::cuda_driver + ) + target_compile_definitions(mlir_cudart_runtime + PRIVATE + MLIR_USE_CUDART_RUNNER=1 + ) + endif() + if(MLIR_ENABLE_ROCM_RUNNER) # Configure ROCm support. 
if (NOT DEFINED ROCM_PATH) @@ -253,6 +278,12 @@ EXCLUDE_FROM_LIBMLIR ) + add_mlir_library(mlir_rocmrt_runtime + SHARED + RocmRuntimeWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) # Supress compiler warnings from HIP headers check_cxx_compiler_flag(-Wno-c++98-compat-extra-semi @@ -260,32 +291,53 @@ if (CXX_SUPPORTS_CXX98_COMPAT_EXTRA_SEMI_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-c++98-compat-extra-semi") + target_compile_options(mlir_rocmrt_runtime PRIVATE + "-Wno-c++98-compat-extra-semi") endif() check_cxx_compiler_flag(-Wno-return-type-c-linkage CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG) if (CXX_SUPPORTS_WNO_RETURN_TYPE_C_LINKAGE_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-return-type-c-linkage") + target_compile_options(mlir_rocmrt_runtime PRIVATE + "-Wno-return-type-c-linkage") endif() check_cxx_compiler_flag(-Wno-nested-anon-types CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG) if (CXX_SUPPORTS_WNO_NESTED_ANON_TYPES_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-nested-anon-types") + target_compile_options(mlir_rocmrt_runtime PRIVATE + "-Wno-nested-anon-types") endif() check_cxx_compiler_flag(-Wno-gnu-anonymous-struct CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG) if (CXX_SUPPORTS_WNO_GNU_ANONYMOUS_STRUCT_FLAG) target_compile_options(mlir_rocm_runtime PRIVATE "-Wno-gnu-anonymous-struct") + target_compile_options(mlir_rocmrt_runtime PRIVATE + "-Wno-gnu-anonymous-struct") endif() set_property(TARGET mlir_rocm_runtime PROPERTY INSTALL_RPATH_USE_LINK_PATH ON) + set_property(TARGET mlir_rocmrt_runtime + PROPERTY INSTALL_RPATH_USE_LINK_PATH ON) + target_link_libraries(mlir_rocm_runtime PUBLIC hip::host hip::amdhip64 ) + + target_link_libraries(mlir_rocmrt_runtime + PUBLIC + hip::host hip::amdhip64 + ) + + target_compile_definitions(mlir_rocmrt_runtime + PRIVATE + MLIR_USE_HIPRT_RUNNER=1 + ) endif() endif() diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -16,15 +16,43 @@ #include +// Most of these functions should be interoperable: +// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DRIVER.html +// However to avoid any pitfalls in context management, the runtime api is used +// when available. 
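+// When MLIR_USE_CUDART_RUNNER is defined (set by the mlir_cudart_runtime
+// CMake target), the wrappers below are built on the CUDA runtime API
+// (cudaMalloc, cudaLaunchKernel, ...) and kernels are launched through the
+// host stub symbols produced by the offload lowering, so
+// mgpuModuleLoad/mgpuModuleGetFunction are compiled out. Otherwise the
+// original driver API path with explicit primary-context management is used.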
+ +#ifdef MLIR_USE_CUDART_RUNNER +#include "cuda_runtime.h" +#endif #include "cuda.h" +#ifdef MLIR_USE_CUDART_RUNNER +#define kernel_t void * +#define stream_t cudaStream_t +#define event_t cudaEvent_t +#else +#define kernel_t CUfunction +#define stream_t CUstream +#define event_t CUevent +#endif + #ifdef _WIN32 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport) #else #define MLIR_CUDA_WRAPPERS_EXPORT #endif // _WIN32 +#ifdef MLIR_USE_CUDART_RUNNER #define CUDA_REPORT_IF_ERROR(expr) \ + [](cudaError_t result) { \ + if (!result) \ + return; \ + const char *name = cudaGetErrorName(result); \ + if (!name) \ + name = ""; \ + fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ + }(expr) +#define CUDA_DRIVER_REPORT_IF_ERROR(expr) \ [](CUresult result) { \ if (!result) \ return; \ @@ -35,6 +63,19 @@ fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ }(expr) +#else +#define CUDA_REPORT_IF_ERROR(expr) \ + [](CUresult result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + cuGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ + }(expr) +#define CUDA_DRIVER_REPORT_IF_ERROR(expr) CUDA_REPORT_IF_ERROR(expr) + thread_local static int32_t defaultDevice = 0; // Make the primary context of the current default device current for the @@ -60,7 +101,9 @@ ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } }; +#endif +#ifndef MLIR_USE_CUDART_RUNNER extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { ScopedContext scopedContext; CUmodule module = nullptr; @@ -78,84 +121,145 @@ CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); return function; } +#endif // The wrapper uses intptr_t instead of CUDA's unsigned int to match // the type of MLIR's index type. This avoids the need for casts in the // generated MLIR code. 
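+// In runtime-API builds, kernel_t is the address of the registered host stub
+// (a plain void *); in driver-API builds it is a CUfunction handle. The
+// mgpuLaunchKernel signature below therefore serves both configurations.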
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, +mgpuLaunchKernel(kernel_t function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, - intptr_t blockZ, int32_t smem, CUstream stream, void **params, + intptr_t blockZ, int32_t smem, stream_t stream, void **params, void **extra) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaLaunchKernel(function, dim3(gridX, gridY, gridZ), + dim3(blockX, blockY, blockZ), params, + smem, stream)); +#else ScopedContext scopedContext; CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)); +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT stream_t mgpuStreamCreate() { +#ifdef MLIR_USE_CUDART_RUNNER + cudaStream_t stream; + CUDA_REPORT_IF_ERROR(cudaStreamCreate(&stream)); + return stream; +#else ScopedContext scopedContext; - CUstream stream = nullptr; + stream_t stream = nullptr; CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); return stream; +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(stream_t stream) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaStreamDestroy(stream)); +#else CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); +#endif } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuStreamSynchronize(CUstream stream) { +mgpuStreamSynchronize(stream_t stream) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaStreamSynchronize(stream)); +#else CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, - CUevent event) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(stream_t stream, + event_t event) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaStreamWaitEvent(stream, event, /*flags=*/0)); +#else CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT event_t mgpuEventCreate() { +#ifdef MLIR_USE_CUDART_RUNNER + cudaEvent_t event; + CUDA_REPORT_IF_ERROR(cudaEventCreate(&event)); + return event; +#else ScopedContext scopedContext; - CUevent event = nullptr; + event_t event = nullptr; CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); return event; +#endif } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(event_t event) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaEventDestroy(event)); +#else CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); +#endif } -extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventSynchronize(CUevent event) { +extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventSynchronize(event_t event) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaEventSynchronize(event)); +#else CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); +#endif } -extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event, - CUstream stream) { +extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(event_t event, + stream_t stream) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaEventRecord(event, stream)); +#else CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); +#endif } -extern "C" void 
*mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) { +extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, stream_t /*stream*/) { +#ifdef MLIR_USE_CUDART_RUNNER + void *ptr; + CUDA_REPORT_IF_ERROR(cudaMalloc(&ptr, sizeBytes)); + return ptr; +#else ScopedContext scopedContext; CUdeviceptr ptr; CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); return reinterpret_cast(ptr); +#endif } -extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) { +extern "C" void mgpuMemFree(void *ptr, stream_t /*stream*/) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaFree(ptr)); +#else CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast(ptr))); +#endif } extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes, - CUstream stream) { + stream_t stream) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR( + cudaMemcpyAsync(dst, src, sizeBytes, cudaMemcpyDefault, stream)); +#else CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast(dst), reinterpret_cast(src), sizeBytes, stream)); +#endif } extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count, - CUstream stream) { - CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast(dst), - value, count, stream)); + stream_t stream) { + // There's no cuda runtime equivalent for this specific function, but it + // should be interoperable: + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DRIVER.html + CUDA_DRIVER_REPORT_IF_ERROR(cuMemsetD32Async( + reinterpret_cast(dst), value, count, stream)); } /// Helper functions for writing mlir example code @@ -164,8 +268,12 @@ // transfer functions implemented. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaHostRegister(ptr, sizeBytes, /*flags=*/0)); +#else ScopedContext scopedContext; CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0)); +#endif } /// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a @@ -194,8 +302,12 @@ // Allows to unregister byte array with the CUDA runtime. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaHostUnregister(ptr)); +#else ScopedContext scopedContext; CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr)); +#endif } /// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a @@ -209,5 +321,9 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { +#ifdef MLIR_USE_CUDART_RUNNER + CUDA_REPORT_IF_ERROR(cudaSetDevice(device)); +#else defaultDevice = device; +#endif } diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -52,15 +52,27 @@ // The wrapper uses intptr_t instead of ROCM's unsigned int to match // the type of MLIR's index type. This avoids the need for casts in the // generated MLIR code. 
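+// As on the CUDA side, kernel_t is the registered host stub (void *) when
+// MLIR_USE_HIPRT_RUNNER is defined, and a hipFunction_t handle otherwise.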
-extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, + +#ifdef MLIR_USE_HIPRT_RUNNER +#define kernel_t void * +#else +#define kernel_t hipFunction_t +#endif +extern "C" void mgpuLaunchKernel(kernel_t function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, hipStream_t stream, void **params, void **extra) { +#ifdef MLIR_USE_HIPRT_RUNNER + HIP_REPORT_IF_ERROR(hipLaunchKernel(function, dim3(gridX, gridY, gridZ), + dim3(blockX, blockY, blockZ), params, + smem, stream)); +#else HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)); +#endif } extern "C" hipStream_t mgpuStreamCreate() { diff --git a/mlir/test/Conversion/GPUCommon/offload.mlir b/mlir/test/Conversion/GPUCommon/offload.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUCommon/offload.mlir @@ -0,0 +1,129 @@ +// RUN: mlir-opt --gpu-to-offload --split-input-file -verify-diagnostics %s | FileCheck %s + +// Perform a comprehensive test of the offload annotations. +module attributes {gpu.container_module} { + // CHECK: llvm.mlir.global internal unnamed_addr constant @".omp_offloading.entry_name.[[KERNEL_ID:.*]]"("[[KERNEL_NAME:.*]]\00") {addr_space = 0 : i32} + // CHECK-NEXT: llvm.mlir.global weak constant @".omp_offloading.entry.[[KERNEL_ID]]"() + // CHECK-SAME: {addr_space = 0 : i32, alignment = 1 : i64, section = "hip_offloading_entries"} : !llvm.struct<[[STRUCT_BODY:.*]]> { + // CHECK-NEXT: %[[V0:.*]] = llvm.mlir.undef : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V1:.*]] = llvm.mlir.addressof @[[KERNEL_ID]]_stub : !llvm.ptr + // CHECK-NEXT: %[[V2:.*]] = llvm.insertvalue %[[V1]], %[[V0]][0] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V3:.*]] = llvm.mlir.addressof @".omp_offloading.entry_name.[[KERNEL_ID]]" : !llvm.ptr + // CHECK-NEXT: %[[V4:.*]] = llvm.insertvalue %[[V3]], %[[V2]][1] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V5:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK-NEXT: %[[V6:.*]] = llvm.insertvalue %[[V5]], %[[V4]][2] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V7:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-NEXT: %[[V8:.*]] = llvm.insertvalue %[[V7]], %[[V6]][3] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: %[[V9:.*]] = llvm.insertvalue %[[V7]], %[[V8]][4] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: llvm.return %[[V9]] : !llvm.struct<[[STRUCT_BODY]]> + // CHECK-NEXT: } + // CHECK: llvm.func @[[KERNEL_ID]]_stub() attributes {dso_local} { + // CHECK-NEXT: llvm.return + // CHECK-NEXT: } + // CHECK: llvm.mlir.global private constant @[[OBJECT_LABEL:.*]]("\10\FF\10\AD") {addr_space = 0 : i32, alignment = 8 : i64, section = ".llvm.offloading"} + // CHECK-NEXT: llvm.mlir.global appending @llvm.compiler.used() {addr_space = 0 : i32, section = "llvm.metadata"} : !llvm.array<1 x ptr> { + // CHECK-NEXT: %[[V0:.*]] = llvm.mlir.undef : !llvm.array<1 x ptr> + // CHECK-NEXT: %[[V1:.*]] = llvm.mlir.addressof @[[OBJECT_LABEL]] : !llvm.ptr + // CHECK-NEXT: %[[COMPILER_USED_GLOBAL:.*]] = llvm.insertvalue %[[V1]], %[[V0]][0] : !llvm.array<1 x ptr> + // CHECK-NEXT: llvm.return %[[COMPILER_USED_GLOBAL]] : !llvm.array<1 x ptr> + // CHECK-NEXT: } + // CHECK: llvm.func @host_function() { + // CHECK: %[[KERNEL:.*]] = llvm.mlir.addressof @[[KERNEL_ID]]_stub : !llvm.ptr + // CHECK: llvm.call @mgpuLaunchKernel(%[[KERNEL]], {{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + // 
CHECK-NEXT: llvm.call @mgpuStreamSynchronize({{.*}}) : (!llvm.ptr) -> () + // CHECK-NEXT: llvm.call @mgpuStreamDestroy({{.*}}) : (!llvm.ptr) -> () + // CHECK-NEXT: llvm.return + // CHECK-NEXT: } + // CHECK: llvm.func @mgpuStreamCreate() -> !llvm.ptr + // CHECK: llvm.func @mgpuLaunchKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) + // CHECK: llvm.func @mgpuStreamSynchronize(!llvm.ptr) + // CHECK: llvm.func @mgpuStreamDestroy(!llvm.ptr) + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + gpu.launch_func @device_module::@llvm_kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + gpu.module @device_module attributes {llvm.offload.kind = "hip", llvm.offload.object = "\10\FF\10\AD"} { + llvm.func @llvm_kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} + +// ----- + +// Test multiple modules. +module attributes {gpu.container_module} { + // CHECK: llvm.mlir.global internal unnamed_addr constant @".omp_offloading.entry_name.[[CUDA_KERNEL_ID:.*]]"("kernel\00") {addr_space = 0 : i32} + // CHECK-NEXT: llvm.mlir.global weak constant @".omp_offloading.entry.[[CUDA_KERNEL_ID]]"() + // CHECK-SAME: {addr_space = 0 : i32, alignment = 1 : i64, section = "cuda_offloading_entries"} : !llvm.struct<[[STRUCT_BODY:.*]]> { + // CHECK: llvm.func @[[CUDA_KERNEL_ID]]_stub() attributes {dso_local} { + // CHECK: llvm.mlir.global internal unnamed_addr constant @".omp_offloading.entry_name.[[HIP_KERNEL_ID:.*]]"("kernel\00") {addr_space = 0 : i32} + // CHECK-NEXT: llvm.mlir.global weak constant @".omp_offloading.entry.[[HIP_KERNEL_ID]]"() + // CHECK-SAME: {addr_space = 0 : i32, alignment = 1 : i64, section = "hip_offloading_entries"} : !llvm.struct<[[STRUCT_BODY:.*]]> { + // CHECK: llvm.func @[[HIP_KERNEL_ID]]_stub() attributes {dso_local} { + // CHECK: llvm.mlir.global private constant @[[OBJECT_LABEL:.*]]("HIP_BLOBCUDA_BLOB") {addr_space = 0 : i32, alignment = 8 : i64, section = ".llvm.offloading"} + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + // CHECK: %[[HIP_KERNEL:.*]] = llvm.mlir.addressof @[[HIP_KERNEL_ID]]_stub : !llvm.ptr + // CHECK: llvm.call @mgpuLaunchKernel(%[[HIP_KERNEL]], {{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + gpu.launch_func @hip_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + // CHECK: %[[CUDA_KERNEL:.*]] = llvm.mlir.addressof @[[CUDA_KERNEL_ID]]_stub : !llvm.ptr + // CHECK: llvm.call @mgpuLaunchKernel(%[[CUDA_KERNEL]], {{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () + gpu.launch_func @cuda_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + gpu.module @hip_module attributes {llvm.offload.kind = "hip", llvm.offload.object = "HIP_BLOB"} { + llvm.func @kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } + gpu.module @cuda_module attributes {llvm.offload.kind = "cuda", llvm.offload.object = "CUDA_BLOB"} { + llvm.func @kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} + +// ----- + +// Test an invalid module with no offload object attribute. 
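+// Both the llvm.offload.kind and llvm.offload.object attributes are expected
+// on every gpu.module; they are normally attached by the gpu-to-nvptx /
+// gpu-to-amdgpu serialization passes, and gpu-to-offload emits an error when
+// either one is missing (see also the next test case).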
+module attributes {gpu.container_module} { + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + gpu.launch_func @hip_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + // expected-error@+1 {{the gpu.module doesn't contain an offload object}} + gpu.module @hip_module attributes {llvm.offload.kind = "hip"} { + llvm.func @kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} + +// ----- + +// Test an invalid module with no offload kind attribute. +module attributes {gpu.container_module} { + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + // expected-error@+1 {{failed to legalize operation 'gpu.launch_func'}} + gpu.launch_func @hip_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + // expected-error@+1 {{the module doesn't contain a valid offloading kind}} + gpu.module @hip_module attributes {llvm.offload.object = "HIP_BLOB"} { + llvm.func @kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} \ No newline at end of file diff --git a/mlir/test/Dialect/GPU/mangle-names.mlir b/mlir/test/Dialect/GPU/mangle-names.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/GPU/mangle-names.mlir @@ -0,0 +1,96 @@ +// RUN: mlir-opt --gpu-name-mangling --split-input-file -verify-diagnostics %s | FileCheck %s + +// Verify that only the symbols *defined* inside the gpu.module are mangled, +// and that all the symbol references are updated accordingly. +module attributes {gpu.container_module} { + // CHECK-LABEL: func.func @bar + func.func @bar() { + return + } + + // CHECK-LABEL: func.func @host_function + func.func @host_function() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + // CHECK: call @bar + func.call @bar(): () -> () + // CHECK: gpu.launch_func @device_module::@__Gdevice_module_Skernel + gpu.launch_func @device_module::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args() + // CHECK: gpu.launch_func @device_module::@__Gdevice_module_Sllvm_kernel + gpu.launch_func @device_module::@llvm_kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%c0: i32) + return + } + + // CHECK-LABEL: gpu.module @device_module + gpu.module @device_module { + // CHECK-LABEL: func.func private @foo + func.func private @foo() + // CHECK-LABEL: func.func @__Gdevice_module_Sbar + func.func @bar() { + return + } + // CHECK-LABEL: gpu.func @__Gdevice_module_Skernel + gpu.func @kernel() kernel attributes {gpu.known_block_size = array} { + // CHECK: call @__Gdevice_module_Sbar + func.call @bar(): () -> () + gpu.return + } + // CHECK-LABEL: llvm.func @__Gdevice_module_Sllvm_kernel + llvm.func @llvm_kernel(%arg0: i32) attributes {gpu.kernel} { + llvm.return + } + } +} + +// ----- + +// Test name mangling with multiple modules. 
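+// Each gpu.module contributes its own prefix (__Gdevice_module_1_S...,
+// __Gdevice_module_2_S...), so identically named kernels in different device
+// modules can no longer collide after mangling.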
+module attributes {gpu.container_module} { + // CHECK-LABEL: func.func @host + func.func @host() { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + // CHECK: gpu.launch_func @device_module_1::@__Gdevice_module_1_Skernel + gpu.launch_func @device_module_1::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args() + // CHECK: gpu.launch_func @device_module_1::@__Gdevice_module_1_Skernel_bar + gpu.launch_func @device_module_1::@kernel_bar blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args() + // CHECK: gpu.launch_func @device_module_2::@__Gdevice_module_2_Skernel + gpu.launch_func @device_module_2::@kernel blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args() + return + } + + // CHECK-LABEL: gpu.module @device_module_1 + gpu.module @device_module_1 { + // CHECK-LABEL: func.func @__Gdevice_module_1_Sbar + func.func @bar() { + return + } + // CHECK-LABEL: gpu.func @__Gdevice_module_1_Skernel + gpu.func @kernel() kernel attributes {gpu.known_block_size = array} { + // CHECK: call @__Gdevice_module_1_Sbar + func.call @bar(): () -> () + gpu.return + } + // CHECK-LABEL: gpu.func @__Gdevice_module_1_Skernel_bar + gpu.func @kernel_bar() kernel attributes {gpu.known_block_size = array} { + gpu.return + } + } + + // CHECK-LABEL: gpu.module @device_module_2 + gpu.module @device_module_2 { + // CHECK-LABEL: func.func @__Gdevice_module_2_Sbar + func.func @bar() { + return + } + // CHECK-LABEL: gpu.func @__Gdevice_module_2_Skernel + gpu.func @kernel() kernel attributes {gpu.known_block_size = array} { + // CHECK: call @__Gdevice_module_2_Sbar + func.call @bar(): () -> () + gpu.return + } + } +} + diff --git a/mlir/test/Dialect/GPU/serialize-to-amdgpu.mlir b/mlir/test/Dialect/GPU/serialize-to-amdgpu.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/GPU/serialize-to-amdgpu.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-opt --gpu-to-amdgpu="rocm-path=" --split-input-file -verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: gpu.module @kernel +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @kernel { + llvm.func @kernel() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } +} + +// ----- + +// Test an empty module +// CHECK-LABEL: gpu.module @empty_module +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @empty_module { +} + +// ----- + +// Test multiple modules +// CHECK-LABEL: gpu.module @module_1 +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_1 { + llvm.func @kernel() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } +} + +// CHECK-LABEL: gpu.module @module_2 +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_2 { + llvm.func @bar(i32) -> f64 + llvm.func @kernel() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } +} + +// CHECK-LABEL: gpu.module @module_3 +// CHECK-SAME: llvm.offload.kind = "hip" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_3 { + llvm.func @kernel_1() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } + llvm.func @kernel_2() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + llvm.return + } +} + +// ----- + +// Test a kernel with an invalid 
instruction. +gpu.module @kernel_invalid { + llvm.func @kernel() attributes {gpu.kernel, rocdl.kernel, rocdl.reqd_work_group_size = array} { + // expected-error@+1 {{cannot be converted to LLVM IR: missing `LLVMTranslationDialectInterface`}} + %0 = nvvm.read.ptx.sreg.ctaid.x : i32 + llvm.return + } +} diff --git a/mlir/test/Dialect/GPU/serialize-to-nvptx.mlir b/mlir/test/Dialect/GPU/serialize-to-nvptx.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/GPU/serialize-to-nvptx.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-opt --gpu-to-nvptx="cuda-path=" --split-input-file -verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: gpu.module @kernel +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @kernel { + llvm.func @kernel() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } +} + +// ----- + +// Test an empty module +// CHECK-LABEL: gpu.module @empty_module +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @empty_module { +} + +// ----- + +// Test multiple modules +// CHECK-LABEL: gpu.module @module_1 +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_1 { + llvm.func @kernel() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } +} + +// CHECK-LABEL: gpu.module @module_2 +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_2 { + llvm.func @bar(i32) -> f64 + llvm.func @kernel() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } +} + +// CHECK-LABEL: gpu.module @module_3 +// CHECK-SAME: llvm.offload.kind = "cuda" +// CHECK-SAME: llvm.offload.object = "\10\FF\10\AD +gpu.module @module_3 { + llvm.func @kernel_1() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } + llvm.func @kernel_2() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + llvm.return + } +} + +// ----- + +// Test a kernel with an invalid instruction. +gpu.module @kernel_invalid { + llvm.func @kernel() attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel} { + // expected-error@+1 {{cannot be converted to LLVM IR: missing `LLVMTranslationDialectInterface`}} + %0 = rocdl.workgroup.id.x : i32 + llvm.return + } +} \ No newline at end of file
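
A rough sketch of how the new passes are meant to compose end to end (an
illustration only; the pipeline below is an assumption based on the RUN lines
above, not something this patch adds or tests):

  // Mangle device symbols, serialize every gpu.module into an offload object,
  // then lower the host side and emit the offload entries:
  //   mlir-opt --gpu-name-mangling --gpu-to-nvptx --gpu-to-offload app.mlir \
  //     | mlir-translate --mlir-to-llvmir -o app.ll
  // The resulting IR carries the ".llvm.offloading" blob plus the
  // "cuda_offloading_entries"/"hip_offloading_entries" sections that clang's
  // new offload driver consumes at link time (see the documentation links in
  // the pass descriptions).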