diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td b/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the GPU NVPTX target attribute.
+// This file defines the GPU NVPTX & AMDGPU target attributes.
 //
 //===----------------------------------------------------------------------===//
 
@@ -88,4 +88,95 @@
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// GPU AMDGPU target attribute.
+//===----------------------------------------------------------------------===//
+
+def GPU_AMDGPUTargetAttr : GPU_Attr<"AMDGPUTarget", "amdgpu", [
+    DeclareAttrInterfaceMethods<GPUTargetAttrInterface> // NOTE(review): confirm the interface name.
+  ]> {
+  let description = [{
+    AMDGPU target attribute for controlling compilation of AMDGPU targets. All
+    parameters decay into default values if not present.
+
+    Examples:
+
+    1. Target with default values.
+    ```
+      gpu.module @mymodule [#gpu.amdgpu] attributes {...} {
+        ...
+      }
+    ```
+
+    2. Target with `gfx90a` chip and fast math.
+    ```
+      gpu.module @mymodule [#gpu.amdgpu<chip = "gfx90a", flags = {fast}>] {
+        ...
+      }
+    ```
+  }];
+  let parameters = (ins
+    DefaultValuedParameter<"int", "2", "Optimization level to apply.">:$O,
+    StringRefParameter<"Target triple.", "\"amdgcn-amd-amdhsa\"">:$triple,
+    StringRefParameter<"Target chip.", "\"gfx900\"">:$chip,
+    StringRefParameter<"Target chip features.", "\"\"">:$features,
+    StringRefParameter<"ABI version.", "\"500\"">:$abi,
+    OptionalParameter<"DictionaryAttr", "Target specific flags.">:$flags,
+    OptionalParameter<"ArrayAttr", "Files to link to the LLVM module.">:$link
+  );
+  let assemblyFormat = [{
+    (`<` struct($O, $triple, $chip, $features, $abi, $flags, $link)^ `>`)?
+  }];
+  let builders = [
+    AttrBuilder<(ins CArg<"int", "2">:$optLevel,
+                     CArg<"StringRef", "\"amdgcn-amd-amdhsa\"">:$triple,
+                     CArg<"StringRef", "\"gfx900\"">:$chip,
+                     CArg<"StringRef", "\"\"">:$features,
+                     CArg<"StringRef", "\"500\"">:$abiVersion,
+                     CArg<"DictionaryAttr", "nullptr">:$targetFlags,
+                     CArg<"ArrayAttr", "nullptr">:$linkFiles), [{
+      return Base::get($_ctxt, optLevel, triple, chip, features, abiVersion,
+                       targetFlags, linkFiles);
+    }]>
+  ];
+  let skipDefaultBuilders = 1;
+  let genVerifyDecl = 1;
+  let extraClassDeclaration = [{
+    bool hasFlag(StringRef flag) const;
+    bool getWave64() const;
+    bool getFastMath() const;
+    bool getDaz() const;
+    bool getFiniteOnly() const;
+    bool getUnsafeMath() const;
+    bool getCorrectSqrt() const;
+  }];
+  let extraClassDefinition = [{
+    bool $cppClass::hasFlag(StringRef flag) const {
+      if (DictionaryAttr flags = getFlags())
+        return flags.get(flag) != nullptr;
+      return false;
+    }
+    bool $cppClass::getWave64() const {
+      return hasFlag("wave64") || !hasFlag("no_wave64");
+    }
+    bool $cppClass::getFastMath() const {
+      return hasFlag("fast");
+    }
+    bool $cppClass::getDaz() const {
+      return hasFlag("daz");
+    }
+    bool $cppClass::getFiniteOnly() const {
+      return hasFlag("finite_only");
+    }
+    bool $cppClass::getUnsafeMath() const {
+      return hasFlag("unsafe_math");
+    }
+    bool $cppClass::getCorrectSqrt() const {
+      return !hasFlag("unsafe_sqrt");
+    }
+  }];
+}
+
 #endif // GPU_COMPILATIONATTR
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -48,6 +48,7 @@
   )
 
 add_mlir_dialect_library(MLIRGPUTargets
+  Targets/AMDGPUTarget.cpp
   Targets/NVPTXTarget.cpp
 
   ADDITIONAL_HEADER_DIRS
@@ -58,6 +59,7 @@
   MC
   Target
   ${NVPTX_LIBS}
+  ${AMDGPU_LIBS}
 
   LINK_LIBS PUBLIC
   MLIRIR
@@ -191,13 +193,32 @@
       "Building mlir with ROCm support requires the AMDGPU backend")
   endif()
 
-  set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
+  if (DEFINED ROCM_PATH)
+    set(DEFAULT_ROCM_PATH "${ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs")
+  elseif(DEFINED ENV{ROCM_PATH})
+    set(DEFAULT_ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs")
+  else()
+    set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
+  endif()
+  message(VERBOSE "MLIR Default ROCM toolkit path: ${DEFAULT_ROCM_PATH}")
+
   target_compile_definitions(obj.MLIRGPUTransforms
     PRIVATE
     __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}"
     MLIR_GPU_TO_HSACO_PASS_ENABLE=1
   )
 
+  # Enable the gpu to amdgpu target.
+  target_compile_definitions(obj.MLIRGPUTargets
+    PRIVATE
+    MLIR_GPU_AMDGPU_TARGET_ENABLED=1
+    __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}"
+  )
+  target_compile_definitions(obj.MLIRGPUTransforms
+    PRIVATE
+    MLIR_GPU_AMDGPU_TARGET_ENABLED=1
+  )
+
   target_link_libraries(MLIRGPUTransforms
     PRIVATE
     MLIRROCDLToLLVMIRTranslation
diff --git a/mlir/lib/Dialect/GPU/Targets/AMDGPUTarget.cpp b/mlir/lib/Dialect/GPU/Targets/AMDGPUTarget.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Targets/AMDGPUTarget.cpp
@@ -0,0 +1,414 @@
+//===- AMDGPUTarget.cpp - MLIR GPU Dialect AMDGPU target attribute --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AMDGPU target attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+#ifdef MLIR_GPU_AMDGPU_TARGET_ENABLED
+#include "mlir/ExecutionEngine/ModuleToObject.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Export.h"
+
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/TargetParser/TargetParser.h"
+
+#ifndef __DEFAULT_ROCM_PATH__
+#define __DEFAULT_ROCM_PATH__ ""
+#endif
+
+#define DEBUG_TYPE "serialize-to-object"
+
+namespace {
+// Registers the AMDGPU backend components with LLVM on construction.
+struct InitTarget {
+  InitTarget() {
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUAsmParser();
+    LLVMInitializeAMDGPUAsmPrinter();
+  }
+};
+
+// Serializes a GPU module to an HSA code object for AMDGPU targets.
+class SerializeToHSA : public ModuleToObject {
+public:
+  SerializeToHSA(Operation &module, AMDGPUTargetAttr target,
+                 TargetOptions targetOptions = {});
+
+  // Init the target.
+  static void init();
+
+  // Get the paths of ROCm device libraries. Function adapted from:
+  // https://github.com/llvm/llvm-project/blob/main/clang/lib/Driver/ToolChains/AMDGPU.cpp
+  void getCommonBitcodeLibs(llvm::SmallVector<std::string> &libs,
+                            SmallVector<char, 256> &libPath,
+                            StringRef isaVersion, bool wave64, bool daz,
+                            bool finiteOnly, bool unsafeMath, bool fastMath,
+                            bool correctSqrt, StringRef abiVer);
+
+  // Removes unnecessary metadata from the loaded bitcode files.
+  void handleBitcodeFile(llvm::Module &module,
+                         llvm::TargetMachine &targetMachine) override;
+  // Assembles the object.
+  std::optional<SmallVector<char, 0>> assembleIsa(StringRef isa);
+
+  // Create the HSACO object.
+  std::optional<SmallVector<char, 0>> createHsaco(SmallVector<char, 0> &&ptx);
+
+  // Loads the ROCm device libraries and the user-provided bitcode files.
+  std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+  loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) override;
+
+  // Translates the module to ISA, assembles it and links it into an HSACO.
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule,
+                 llvm::TargetMachine &targetMachine) override;
+
+private:
+  AMDGPUTargetAttr target;
+  StringRef toolkitPath;
+  SmallVector<std::string> fileList;
+};
+} // namespace
+
+SerializeToHSA::SerializeToHSA(Operation &module, AMDGPUTargetAttr target,
+                               TargetOptions targetOptions)
+    : ModuleToObject(module, target.getTriple(), target.getChip(),
+                     target.getFeatures(), target.getO()),
+      target(target), toolkitPath(targetOptions.getToolkitPath()),
+      fileList(targetOptions.getBitcodeFiles()) {
+  if (toolkitPath.empty())
+    toolkitPath = __DEFAULT_ROCM_PATH__;
+
+  if (ArrayAttr files = target.getLink())
+    for (Attribute attr : files.getValue())
+      if (auto file = dyn_cast<StringAttr>(attr))
+        fileList.push_back(file.str());
+}
+
+void SerializeToHSA::init() { static InitTarget target = InitTarget(); }
+
+void SerializeToHSA::getCommonBitcodeLibs(llvm::SmallVector<std::string> &libs,
+                                          SmallVector<char, 256> &libPath,
+                                          StringRef isaVersion, bool wave64,
+                                          bool daz, bool finiteOnly,
+                                          bool unsafeMath, bool fastMath,
+                                          bool correctSqrt, StringRef abiVer) {
+  auto addLib = [&](StringRef path) {
+    if (!llvm::sys::fs::is_regular_file(path)) {
+      getOperation().emitRemark() << "Bitcode library path: " << path
+                                  << " does not exist or is not a file.\n";
+      return;
+    }
+    libs.push_back(path.str());
+  };
+  auto optLib = [](StringRef name, bool on) -> Twine {
+    return name + (on ? "_on" : "_off");
+  };
+  auto getLibPath = [&libPath](Twine lib) {
+    auto baseSize = libPath.size();
+    llvm::sys::path::append(libPath, lib + ".bc");
+    std::string path(StringRef(libPath.data(), libPath.size()).str());
+    libPath.truncate(baseSize);
+    return path;
+  };
+
+  // Add ROCm device libraries.
+  addLib(getLibPath("ocml"));
+  addLib(getLibPath("ockl"));
+  addLib(getLibPath(optLib("oclc_daz_opt", daz)));
+  addLib(getLibPath(optLib("oclc_unsafe_math", unsafeMath || fastMath)));
+  addLib(getLibPath(optLib("oclc_finite_only", finiteOnly || fastMath)));
+  addLib(getLibPath(optLib("oclc_correctly_rounded_sqrt", correctSqrt)));
+  addLib(getLibPath(optLib("oclc_wavefrontsize64", wave64)));
+  addLib(getLibPath("oclc_isa_version_" + isaVersion));
+  if (abiVer.size())
+    addLib(getLibPath("oclc_abi_version_" + abiVer));
+}
+
+std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+SerializeToHSA::loadBitcodeFiles(llvm::LLVMContext &context,
+                                 llvm::Module &module) {
+  // Try loading device libraries from the ROCm toolkit installation.
+  StringRef pathRef = toolkitPath;
+  if (pathRef.size()) {
+    SmallVector<char, 256> path;
+    path.insert(path.begin(), pathRef.begin(), pathRef.end());
+    llvm::sys::path::append(path, "amdgcn", "bitcode");
+    pathRef = StringRef(path.data(), path.size());
+    if (!llvm::sys::fs::is_directory(pathRef)) {
+      getOperation().emitRemark() << "ROCm amdgcn bitcode path: " << pathRef
+                                  << " does not exist or is not a directory.";
+      return std::nullopt;
+    }
+    StringRef isaVersion =
+        llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip));
+    isaVersion.consume_front("gfx");
+    getCommonBitcodeLibs(fileList, path, isaVersion, target.getWave64(),
+                         target.getDaz(), target.getFiniteOnly(),
+                         target.getUnsafeMath(), target.getFastMath(),
+                         target.getCorrectSqrt(), target.getAbi());
+  }
+
+  SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
+  if (failed(loadBitcodeFilesFromList(context, fileList, bcFiles, true)))
+    return std::nullopt;
+  return bcFiles;
+}
+
+void SerializeToHSA::handleBitcodeFile(llvm::Module &module,
+                                       llvm::TargetMachine &targetMachine) {
+  // Some ROCM builds don't strip this like they should
+  if (auto *openclVersion = module.getNamedMetadata("opencl.ocl.version"))
+    module.eraseNamedMetadata(openclVersion);
+  // Stop spamming us with clang version numbers
+  if (auto *ident = module.getNamedMetadata("llvm.ident"))
+    module.eraseNamedMetadata(ident);
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU pipeline methods.
+//===----------------------------------------------------------------------===//
+#include "mlir/Support/FileUtilities.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/Program.h"
+
+std::optional<SmallVector<char, 0>> SerializeToHSA::assembleIsa(StringRef isa) {
+  auto loc = getOperation().getLoc();
+
+  StringRef targetTriple = this->triple;
+
+  SmallVector<char, 0> result;
+  llvm::raw_svector_ostream os(result);
+
+  llvm::Triple triple(llvm::Triple::normalize(targetTriple));
+  std::string error;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(triple.normalize(), error);
+  if (!target) {
+    emitError(loc, Twine("failed to lookup target: ") + error);
+    return std::nullopt;
+  }
+
+  llvm::SourceMgr srcMgr;
+  srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), SMLoc());
+
+  const llvm::MCTargetOptions mcOptions;
+  std::unique_ptr<llvm::MCRegisterInfo> mri(
+      target->createMCRegInfo(targetTriple));
+  std::unique_ptr<llvm::MCAsmInfo> mai(
+      target->createMCAsmInfo(*mri, targetTriple, mcOptions));
+  mai->setRelaxELFRelocations(true);
+  std::unique_ptr<llvm::MCSubtargetInfo> sti(
+      target->createMCSubtargetInfo(targetTriple, chip, features));
+
+  llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
+                      &mcOptions);
+  std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo(
+      ctx, /*PIC=*/false, /*LargeCodeModel=*/false));
+  ctx.setObjectFileInfo(mofi.get());
+
+  SmallString<128> cwd;
+  if (!llvm::sys::fs::current_path(cwd))
+    ctx.setCompilationDir(cwd);
+
+  std::unique_ptr<llvm::MCStreamer> mcStreamer;
+  std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());
+
+  llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, ctx);
+  llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions);
+  mcStreamer.reset(target->createMCObjectStreamer(
+      triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab),
+      mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce),
+      *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
+      /*DWARFMustBeAtTheEnd*/ false));
+  mcStreamer->setUseAssemblerInfoForParsing(true);
+
+  std::unique_ptr<llvm::MCAsmParser> parser(
+      createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
+  std::unique_ptr<llvm::MCTargetAsmParser> tap(
+      target->createMCAsmParser(*sti, *parser, *mcii, mcOptions));
+
+  if (!tap) {
+    emitError(loc, "assembler initialization error");
+    return {};
+  }
+
+  parser->setTargetParser(*tap);
+  parser->Run(false);
+
+  return result;
+}
+
+std::optional<SmallVector<char, 0>>
+SerializeToHSA::createHsaco(SmallVector<char, 0> &&ptx) {
+  SmallVector<char, 0> isaBinary = std::move(ptx);
+  auto loc = getOperation().getLoc();
+
+  // Save the ISA binary to a temp file.
+  int tempIsaBinaryFd = -1;
+  SmallString<128> tempIsaBinaryFilename;
+  if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd,
+                                         tempIsaBinaryFilename)) {
+    emitError(loc, "temporary file for ISA binary creation error");
+    return {};
+  }
+  llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
+  llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
+  tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size());
+  tempIsaBinaryOs.close();
+
+  // Create a temp file for HSA code object.
+  int tempHsacoFD = -1;
+  SmallString<128> tempHsacoFilename;
+  if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD,
+                                         tempHsacoFilename)) {
+    emitError(loc, "temporary file for HSA code object creation error");
+    return {};
+  }
+  llvm::FileRemover cleanupHsaco(tempHsacoFilename);
+
+  llvm::SmallString<32> lldPath(toolkitPath);
+  llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld");
+  int lldResult = llvm::sys::ExecuteAndWait(
+      lldPath,
+      {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
+  if (lldResult != 0) {
+    emitError(loc, "lld invocation error");
+    return {};
+  }
+
+  // Load the HSA code object.
+  auto hsacoFile = openInputFile(tempHsacoFilename);
+  if (!hsacoFile) {
+    emitError(loc, "read HSA code object from temp file error");
+    return {};
+  }
+
+  StringRef buffer = hsacoFile->getBuffer();
+
+  return SmallVector<char, 0>(buffer.begin(), buffer.end());
+}
+
+std::optional<SmallVector<char, 0>>
+SerializeToHSA::moduleToObject(llvm::Module &llvmModule,
+                               llvm::TargetMachine &targetMachine) {
+  std::optional<std::string> serializedISA =
+      translateToISA(llvmModule, targetMachine);
+  if (!serializedISA) {
+    getOperation().emitError() << "Failed translating the module to ISA.";
+    return std::nullopt;
+  }
+
+  LLVM_DEBUG({
+    llvm::dbgs() << "ISA for module: "
+                 << dyn_cast<GPUModuleOp>(&getOperation()).getNameAttr()
+                 << "\n";
+    llvm::dbgs() << *serializedISA << "\n";
+    llvm::dbgs().flush();
+  });
+
+  std::optional<SmallVector<char, 0>> assembledIsa =
+      assembleIsa(serializedISA.value());
+
+  if (!assembledIsa) {
+    getOperation().emitError() << "Failed during ISA assembling.";
+    return std::nullopt;
+  }
+
+  return createHsaco(std::move(assembledIsa.value()));
+}
+
+// Serializes the GPU module; returns std::nullopt on failure.
+std::optional<SmallVector<char, 0>>
+AMDGPUTargetAttr::serializeToObject(Operation *module,
+                                    const TargetOptions &options) const {
+  assert(module && "The module must be non null.");
+  if (!module)
+    return std::nullopt;
+  if (!mlir::isa<gpu::GPUModuleOp>(module)) {
+    module->emitError("Module must be a GPU module.");
+    return std::nullopt;
+  }
+  SerializeToHSA::init();
+  SerializeToHSA serializer(*module, *this, options);
+  return serializer.run();
+}
+
+#else
+// Provide a null vector for testing purposes.
+std::optional<SmallVector<char, 0>>
+AMDGPUTargetAttr::serializeToObject(Operation *module,
+                                    const TargetOptions &options) const {
+  assert(module && "The module must be non null.");
+  if (!module)
+    return std::nullopt;
+  if (!mlir::isa<gpu::GPUModuleOp>(module)) {
+    module->emitError("Module must be a GPU module.");
+    return std::nullopt;
+  }
+  return SmallVector<char, 0>{};
+}
+#endif // MLIR_GPU_AMDGPU_TARGET_ENABLED
+
+// Verifies the attribute parameters, emitting a diagnostic on the first
+// invalid one.
+LogicalResult
+AMDGPUTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                         int optLevel, StringRef triple, StringRef chip,
+                         StringRef features, StringRef abiVersion,
+                         DictionaryAttr flags, ArrayAttr files) {
+  if (optLevel < 0 || optLevel > 3) {
+    emitError() << "The optimization level must be a number between 0 and 3.";
+    return failure();
+  }
+  if (triple.empty()) {
+    emitError() << "The target triple cannot be empty.";
+    return failure();
+  }
+  if (chip.empty()) {
+    emitError() << "The target chip cannot be empty.";
+    return failure();
+  }
+  if (abiVersion != "400" && abiVersion != "500") {
+    emitError() << "Invalid ABI version, it must be either `400` or `500`.";
+    return failure();
+  }
+  // Fail when at least one element of the `link` array is not a string.
+  if (files && !llvm::all_of(files, [](::mlir::Attribute attr) {
+        return attr && mlir::isa<StringAttr>(attr);
+      })) {
+    emitError() << "All the elements in the `link` array must be strings.";
+    return failure();
+  }
+  return success();
+}