diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h --- a/mlir/include/mlir/Dialect/GPU/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Passes.h @@ -54,14 +54,23 @@ protected: void getDependentDialects(DialectRegistry ®istry) const override; -private: - /// Creates the LLVM target machine to generate the ISA. - std::unique_ptr createTargetMachine(); + /// Translates the module to ISA + virtual Optional + translateToISA(llvm::Module &llvmModule, llvm::TargetMachine &targetMachine); + + /// Hook allowing the application of optimizations before codegen + /// By default, does nothing + virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine); /// Translates the 'getOperation()' result to an LLVM module. virtual std::unique_ptr translateToLLVMIR(llvm::LLVMContext &llvmContext); +private: + /// Creates the LLVM target machine to generate the ISA. + std::unique_ptr createTargetMachine(); + /// Serializes the target ISA to binary form. virtual std::unique_ptr> serializeISA(const std::string &isa) = 0; diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -8,11 +8,14 @@ if (MLIR_ENABLE_ROCM_CONVERSIONS) set(AMDGPU_LIBS + IRReader + linker MCParser AMDGPUAsmParser AMDGPUCodeGen AMDGPUDesc AMDGPUInfo + target ) endif() @@ -126,40 +129,22 @@ message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS") endif() - # Configure ROCm support. - if (NOT DEFINED ROCM_PATH) - if (NOT DEFINED ENV{ROCM_PATH}) - set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") - else() - set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") - endif() - set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH " Path to which HIP has been installed") - endif() - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) - find_package(HIP) - if (NOT HIP_FOUND) - message(SEND_ERROR "Building mlir with ROCm support requires a working ROCm and HIP install") - else() - message(STATUS "ROCm HIP version: ${HIP_VERSION}") - endif() - + set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs") target_compile_definitions(obj.MLIRGPUOps PRIVATE - __HIP_PLATFORM_HCC__ - __ROCM_PATH__="${ROCM_PATH}" + __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}" MLIR_GPU_TO_HSACO_PASS_ENABLE=1 ) target_include_directories(obj.MLIRGPUOps PRIVATE ${MLIR_SOURCE_DIR}/../lld/include - ${HIP_PATH}/include - ${ROCM_PATH}/include ) target_link_libraries(MLIRGPUOps PRIVATE lldELF + MLIRExecutionEngine MLIRROCDLToLLVMIRTranslation ) diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp --- a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp @@ -31,18 +31,28 @@ gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other) : OperationPass(other) {} -static std::string translateToISA(llvm::Module &llvmModule, - llvm::TargetMachine &targetMachine) { +Optional +gpu::SerializeToBlobPass::translateToISA(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { llvmModule.setDataLayout(targetMachine.createDataLayout()); + if (failed(optimizeLlvm(llvmModule, targetMachine))) { + return llvm::None; + } std::string targetISA; llvm::raw_string_ostream stream(targetISA); - llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegenPasses; - targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, - llvm::CGFT_AssemblyFile); - codegenPasses.run(llvmModule); - return targetISA; + + { // Drop pstream after this to prevent the ISA from being stuck buffering + llvm::buffer_ostream pstream(stream); + if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, + llvm::CGFT_AssemblyFile)) { + return llvm::None; + } + codegenPasses.run(llvmModule); + } + return stream.str(); } void gpu::SerializeToBlobPass::runOnOperation() { @@ -58,7 +68,13 @@ if (!targetMachine) return signalPassFailure(); - std::string targetISA = translateToISA(*llvmModule, *targetMachine); + Optional maybeTargetISA = + translateToISA(*llvmModule, *targetMachine); + + if (!maybeTargetISA.hasValue()) { + return signalPassFailure(); + } + std::string targetISA = maybeTargetISA.getValue(); // Serialize the target ISA. std::unique_ptr> blob = serializeISA(targetISA); @@ -71,6 +87,14 @@ getOperation()->setAttr(gpuBinaryAnnotation, attr); } +LogicalResult +gpu::SerializeToBlobPass::optimizeLlvm(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + // TODO: If serializeToCubin ends up defining optimizations, factor them + // into here from SerializeToHsaco + return success(); +} + void gpu::SerializeToBlobPass::getDependentDialects( DialectRegistry ®istry) const { registerLLVMDialectTranslation(registry); diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp --- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp @@ -13,11 +13,17 @@ #include "mlir/Dialect/GPU/Passes.h" #if MLIR_GPU_TO_HSACO_PASS_ENABLE +#include "mlir/ExecutionEngine/OptUtils.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Export.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" + #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" @@ -27,19 +33,19 @@ #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" - #include "llvm/MC/TargetRegistry.h" + +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FileUtilities.h" -#include "llvm/Support/LineIterator.h" #include "llvm/Support/Program.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/WithColor.h" + #include "llvm/Target/TargetOptions.h" #include "lld/Common/Driver.h" -#include "hip/hip_version.h" - #include using namespace mlir; @@ -48,13 +54,36 @@ class SerializeToHsacoPass : public PassWrapper { public: + // Needed to make options work SerializeToHsacoPass(); + SerializeToHsacoPass(const SerializeToHsacoPass &other) { + if (other.triple.hasValue()) { + this->triple = other.triple; + } + if (other.chip.hasValue()) { + this->chip = other.chip; + } + if (other.features.hasValue()) { + this->features = other.features; + } + if (other.rocmPath.hasValue()) { + this->rocmPath = other.rocmPath; + } + this->optLevel = other.optLevel; + }; + + SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features, + int optLevel); StringRef getArgument() const override { return "gpu-to-hsaco"; } StringRef getDescription() const override { return "Lower GPU kernel function to HSACO binary annotations"; } +protected: + Option rocmPath{*this, "rocm-path", + llvm::cl::desc("Path to ROCm install")}; + private: void getDependentDialects(DialectRegistry ®istry) const override; @@ -62,67 +91,35 @@ std::unique_ptr> serializeISA(const std::string &isa) override; + // Overload to allow linking in device libs + std::unique_ptr + translateToLLVMIR(llvm::LLVMContext &llvmContext) override; + + /// Adds LLVM optimization passes + LogicalResult optimizeLlvm(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) override; + std::unique_ptr> assembleIsa(const std::string &isa); std::unique_ptr> createHsaco(const SmallVectorImpl &isaBinary); -}; -} // namespace - -static std::string getDefaultChip() { - const char kDefaultChip[] = "gfx900"; - // Locate rocm_agent_enumerator. - const char kRocmAgentEnumerator[] = "rocm_agent_enumerator"; - llvm::ErrorOr rocmAgentEnumerator = llvm::sys::findProgramByName( - kRocmAgentEnumerator, {__ROCM_PATH__ "/bin"}); - if (!rocmAgentEnumerator) { - llvm::WithColor::warning(llvm::errs()) - << kRocmAgentEnumerator << "couldn't be located under " << __ROCM_PATH__ - << "/bin\n"; - return kDefaultChip; - } + std::string getRocmPath(); - // Prepare temp file to hold the outputs. - int tempFd = -1; - SmallString<128> tempFilename; - if (llvm::sys::fs::createTemporaryFile("rocm_agent", "txt", tempFd, - tempFilename)) { - llvm::WithColor::warning(llvm::errs()) - << "temporary file for " << kRocmAgentEnumerator << " creation error\n"; - return kDefaultChip; - } - llvm::FileRemover cleanup(tempFilename); - - // Invoke rocm_agent_enumerator. - std::string errorMessage; - SmallVector args{"-t", "GPU"}; - Optional redirects[3] = {{""}, tempFilename.str(), {""}}; - int result = - llvm::sys::ExecuteAndWait(rocmAgentEnumerator.get(), args, llvm::None, - redirects, 0, 0, &errorMessage); - if (result) { - llvm::WithColor::warning(llvm::errs()) - << kRocmAgentEnumerator << " invocation error: " << errorMessage - << "\n"; - return kDefaultChip; - } - - // Load and parse the result. - auto gfxIsaList = openInputFile(tempFilename); - if (!gfxIsaList) { - llvm::WithColor::error(llvm::errs()) - << "read ROCm agent list temp file error\n"; - return kDefaultChip; + int optLevel; +}; +} // end namespace + +/// Get a user-specified path to ROCm +// Tries, in order, the --rocm-path option, the ROCM_PATH environment variable +// and a compile-time default +std::string SerializeToHsacoPass::getRocmPath() { + if (rocmPath.getNumOccurrences() > 0) { + return rocmPath.getValue(); } - for (llvm::line_iterator lines(*gfxIsaList); !lines.is_at_end(); ++lines) { - // Skip the line with content "gfx000". - if (*lines == "gfx000") - continue; - // Use the first ISA version found. - return lines->str(); + if (auto env = llvm::sys::Process::GetEnv("ROCM_PATH")) { + return env.getValue(); } - - return kDefaultChip; + return __DEFAULT_ROCM_PATH__; } // Sets the 'option' to 'value' unless it already has a value. @@ -132,12 +129,12 @@ option = getValue(); } -SerializeToHsacoPass::SerializeToHsacoPass() { - maybeSetOption(this->triple, [] { return "amdgcn-amd-amdhsa"; }); - maybeSetOption(this->chip, [] { - static auto chip = getDefaultChip(); - return chip; - }); +SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch, + StringRef features, int optLevel) + : optLevel(optLevel) { + maybeSetOption(this->triple, [&triple] { return triple.str(); }); + maybeSetOption(this->chip, [&arch] { return arch.str(); }); + maybeSetOption(this->features, [&features] { return features.str(); }); } void SerializeToHsacoPass::getDependentDialects( @@ -146,6 +143,208 @@ gpu::SerializeToBlobPass::getDependentDialects(registry); } +static Optional, 3>> +loadLibraries(SmallVectorImpl &path, + SmallVectorImpl &libraries, + llvm::LLVMContext &context) { + SmallVector, 3> ret; + auto dirLength = path.size(); + + if (!llvm::sys::fs::is_directory(path)) { + llvm::dbgs() << "Bitcode path: " << path + << " does not exist or is not a directory\n"; + return llvm::None; + } + + for (const auto &file : libraries) { + llvm::SMDiagnostic error; + llvm::sys::path::append(path, file); + llvm::StringRef pathRef(path.data(), path.size()); + std::unique_ptr library = + llvm::getLazyIRFileModule(pathRef, error, context); + path.set_size(dirLength); + if (!library) { + llvm::dbgs() << "Failed to load library " << file << " from " << path; + error.print("[MLIR backend]", llvm::dbgs()); + return llvm::None; + } + // Some ROCM builds don't strip this like they should + if (auto *openclVersion = library->getNamedMetadata("opencl.ocl.version")) { + library->eraseNamedMetadata(openclVersion); + } + // Stop spamming us with clang version numbers + if (auto *ident = library->getNamedMetadata("llvm.ident")) { + library->eraseNamedMetadata(ident); + } + ret.push_back(std::move(library)); + } + + return ret; +} + +std::unique_ptr +SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) { + // MLIR -> LLVM translation + std::unique_ptr ret = + gpu::SerializeToBlobPass::translateToLLVMIR(llvmContext); + + if (!ret) { + llvm::dbgs() << "Module creation failed"; + return ret; + } + // Walk the LLVM module in order to determine if we need to link in device + // libs + bool needOpenCl = false; + bool needOckl = false; + bool needOcml = false; + for (auto &f : ret->functions()) { + if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) { + StringRef funcName = f.getName(); + if ("printf" == funcName) { + needOpenCl = true; + } + if (funcName.startswith("__ockl_")) { + needOckl = true; + } + if (funcName.startswith("__ocml_")) { + needOcml = true; + } + } + } + + if (needOpenCl) { + needOcml = needOckl = true; + } + + // No libraries needed (the typical case) + if (!(needOpenCl || needOcml || needOckl)) { + return ret; + } + + auto addControlConstant = [&module = *ret](StringRef name, uint32_t value, + uint32_t bitwidth) { + using llvm::GlobalVariable; + if (module.getNamedGlobal(name)) { + return; + } + llvm::IntegerType *type = + llvm::IntegerType::getIntNTy(module.getContext(), bitwidth); + auto *initializer = llvm::ConstantInt::get(type, value, /*isSigned=*/false); + auto *constant = new GlobalVariable( + module, type, + /*isConstant=*/true, GlobalVariable::LinkageTypes::LinkOnceODRLinkage, + initializer, name, + /*before=*/nullptr, + /*threadLocalMode=*/GlobalVariable::ThreadLocalMode::NotThreadLocal, + /*addressSpace=*/4); + constant->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local); + constant->setVisibility( + GlobalVariable::VisibilityTypes::ProtectedVisibility); + constant->setAlignment(llvm::MaybeAlign(bitwidth / 8)); + }; + + // Set up control variables in the module instead of linking in tiny bitcode + if (needOcml) { + // TODO(kdrewnia): Enable math optimizations once we have support for + // `-ffast-math`-like options + addControlConstant("__oclc_finite_only_opt", 0, 8); + addControlConstant("__oclc_daz_opt", 0, 8); + addControlConstant("__oclc_correctly_rounded_sqrt32", 1, 8); + addControlConstant("__oclc_unsafe_math_opt", 0, 8); + } + if (needOcml || needOckl) { + addControlConstant("__oclc_wavefrontsize64", 1, 8); + StringRef chipSet = this->chip.getValue(); + if (chipSet.startswith("gfx")) { + chipSet = chipSet.substr(3); + } + uint32_t minor = + llvm::APInt(32, chipSet.substr(chipSet.size() - 2), 16).getZExtValue(); + uint32_t major = llvm::APInt(32, chipSet.substr(0, chipSet.size() - 2), 10) + .getZExtValue(); + uint32_t isaNumber = minor + 1000 * major; + addControlConstant("__oclc_ISA_version", isaNumber, 32); + } + + // Determine libraries we need to link + llvm::SmallVector libraries; + if (needOpenCl) { + libraries.push_back("opencl.bc"); + } + if (needOcml) { + libraries.push_back("ocml.bc"); + } + if (needOckl) { + libraries.push_back("ockl.bc"); + } + + Optional, 3>> mbModules; + auto theRocmPath = getRocmPath(); + llvm::SmallString<32> bitcodePath(theRocmPath); + llvm::sys::path::append(bitcodePath, "amdgcn", "bitcode"); + mbModules = loadLibraries(bitcodePath, libraries, llvmContext); + + // Handle legacy override variable + auto env = llvm::sys::Process::GetEnv("HIP_DEVICE_LIB_PATH"); + if (env && (rocmPath.getNumOccurrences() == 0)) { + llvm::SmallString<32> overrideValue(env.getValue()); + auto mbAtOldPath = loadLibraries(overrideValue, libraries, llvmContext); + if (mbAtOldPath) { + mbModules = std::move(mbAtOldPath); + } + } + + if (!mbModules) { + llvm::WithColor::warning(llvm::errs()) + << "Warning: Could not load required device labraries\n"; + llvm::WithColor::note(llvm::errs()) + << "Note: this will probably cause link-time or run-time failures\n"; + return ret; // We can still abort here + } + + llvm::Linker linker(*ret); + for (auto &libModule : mbModules.getValue()) { + // Failure is true + auto err = linker.linkInModule( + std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded, + [](llvm::Module &m, const StringSet<> &gvs) { + llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) { + return !gv.hasName() || (gvs.count(gv.getName()) == 0); + }); + }); + if (err) { + llvm::errs() << "Error: Failure in library bitcode linking\n"; + // We have no guaranties about the state of `ret`, so bail + return nullptr; + } + } + return ret; +} + +LogicalResult +SerializeToHsacoPass::optimizeLlvm(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + if (optLevel < 0 || optLevel > 3) { + llvm::errs() << "Invalid optimization level passed to SerializeToHsaco: " + << optLevel << "\n"; + return failure(); + } + targetMachine.setOptLevel(static_cast(optLevel)); + + auto transformer = + makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine); + auto error = transformer(&llvmModule); + if (error) { + llvm::handleAllErrors(std::move(error), [](const llvm::ErrorInfoBase &ei) { + llvm::errs() << "Could not optimize LLVM IR: "; + ei.log(llvm::errs()); + llvm::errs() << "\n"; + }); + return failure(); + } + return success(); +} + std::unique_ptr> SerializeToHsacoPass::assembleIsa(const std::string &isa) { auto loc = getOperation().getLoc(); @@ -281,7 +480,10 @@ LLVMInitializeAMDGPUTargetInfo(); LLVMInitializeAMDGPUTargetMC(); - return std::make_unique(); + // Known-bad values for constructor arguments since the instance of the + // pass that's registered here will never be used directly + return std::make_unique( + "", "[GARBAGE]", "+veryfake,-cantsurface", -1); }); } #else // MLIR_GPU_TO_HSACO_PASS_ENABLE