diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -31,6 +31,15 @@ # TODO: we should use a config.h file like LLVM does add_definitions(-DMLIR_CUDA_CONVERSIONS_ENABLED=${MLIR_CUDA_CONVERSIONS_ENABLED}) +# Build the ROCm conversions and run the corresponding tests if the AMDGPU +# backend is available +if ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + set(MLIR_ROCM_CONVERSIONS_ENABLED 1) +else() + set(MLIR_ROCM_CONVERSIONS_ENABLED 0) +endif() +add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ROCM_CONVERSIONS_ENABLED}) + set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner") set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner") diff --git a/mlir/include/mlir/Conversion/GPUToROCm/GPUToROCmPass.h b/mlir/include/mlir/Conversion/GPUToROCm/GPUToROCmPass.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Conversion/GPUToROCm/GPUToROCmPass.h @@ -0,0 +1,52 @@ +//===- GPUToROCmPass.h - MLIR ROCm runtime support --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_ +#define MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_ + +#include "mlir/Support/LLVM.h" +#include <functional> +#include <memory> +#include <string> +#include <vector> + +namespace mlir { + +class Location; +class ModuleOp; + +template <typename T> +class OperationPass; + +namespace gpu { +class GPUModuleOp; +} // namespace gpu + +namespace LLVM { +class LLVMDialect; +} // namespace LLVM + +using OwnedHsaco = std::unique_ptr<std::vector<char>>; +using HsacoGenerator = + std::function<OwnedHsaco(const std::string &, Location, StringRef)>; + +/// Creates a pass to convert kernel functions into HSA code object blobs. 
+/// +/// This transformation takes the body of each function that is annotated with +/// the 'gpu.kernel' attribute, copies it to a new LLVM module, compiles the +/// module with help of the AMDGPU backend to HSA code object and then invokes +/// the provided hsacoGenerator to produce a binary blob (the hsaco). Such blob +/// is then attached as a string attribute named 'rocdl.hsaco' to the enclosing +/// gpu.module. +/// After the transformation, the body of the kernel function is removed (i.e., +/// it is turned into a declaration). +std::unique_ptr<OperationPass<gpu::GPUModuleOp>> +createConvertGPUKernelToHsacoPass(HsacoGenerator hsacoGenerator); + +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_ diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -18,6 +18,7 @@ #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" +#include "mlir/Conversion/GPUToROCm/GPUToROCmPass.h" #include "mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h" #include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h" #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(GPUToCUDA) add_subdirectory(GPUToNVVM) add_subdirectory(GPUToROCDL) +add_subdirectory(GPUToROCm) add_subdirectory(GPUToSPIRV) add_subdirectory(GPUToVulkan) add_subdirectory(LinalgToLLVM) diff --git a/mlir/lib/Conversion/GPUToROCm/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCm/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/GPUToROCm/CMakeLists.txt @@ -0,0 +1,40 @@ +set(LLVM_OPTIONAL_SOURCES + ConvertKernelFuncToHsaco.cpp +) + +# TBD +# set(SOURCES +# ConvertLaunchFuncToROCmCalls.cpp +# ) 
+ +if (MLIR_ROCM_CONVERSIONS_ENABLED) + list(APPEND SOURCES "ConvertKernelFuncToHsaco.cpp") + set(AMDGPU_LIBS + MC + AMDGPUCodeGen + AMDGPUDesc + AMDGPUInfo + ) + +endif() + +add_mlir_conversion_library(MLIRGPUtoROCmTransforms + ${SOURCES} + + DEPENDS + MLIRConversionPassIncGen + intrinsics_gen + + LINK_COMPONENTS + Core + ${AMDGPU_LIBS} + + LINK_LIBS PUBLIC + MLIRGPU + MLIRIR + MLIRLLVMIR + MLIRROCDLIR + MLIRPass + MLIRSupport + MLIRTargetROCDLIR +) diff --git a/mlir/lib/Conversion/GPUToROCm/ConvertKernelFuncToHsaco.cpp b/mlir/lib/Conversion/GPUToROCm/ConvertKernelFuncToHsaco.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/GPUToROCm/ConvertKernelFuncToHsaco.cpp @@ -0,0 +1,162 @@ +//===- ConvertKernelFuncToHsaco.cpp - MLIR GPU lowering passes ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to convert gpu kernel functions into a +// corresponding binary blob that can be executed on a ROCm GPU. Currently +// only translates the function itself but no dependencies. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/GPUToROCm/GPUToROCmPass.h" + +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Module.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Target/ROCDLIR.h" + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/Mutex.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" + +using namespace mlir; + +namespace { +static constexpr const char *kHsacoAnnotation = "rocdl.hsaco"; + +/// A pass converting tagged kernel modules to hsaco blobs. +/// +/// If tagged as a kernel module, each contained function is translated to ROCDL +/// IR. A user provided HsacoGenerator compiles the IR to GPU binary code in HSA +/// code object format, which is then attached as an attribute to the module. +/// The function body is erased. +class GpuKernelToHsacoPass + : public PassWrapper<GpuKernelToHsacoPass, + OperationPass<gpu::GPUModuleOp>> { +public: + GpuKernelToHsacoPass(HsacoGenerator hsacoGenerator) + : hsacoGenerator(hsacoGenerator) {} + + void runOnOperation() override { + gpu::GPUModuleOp module = getOperation(); + + // Lock access to the llvm context. + llvm::sys::SmartScopedLock<true> scopedLock( + module.getContext() + ->getRegisteredDialect<LLVM::LLVMDialect>() + ->getLLVMContextMutex()); + + // Make sure the AMDGPU target is initialized. 
+ LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + + auto llvmModule = translateModuleToROCDLIR(module); + if (!llvmModule) + return signalPassFailure(); + + // Translate the module to HSA code object and attach the result as + // attribute to the module. + if (auto hsacoAttr = translateGPUModuleToHsacoAnnotation( + *llvmModule, module.getLoc(), module.getName())) + module.setAttr(kHsacoAnnotation, hsacoAttr); + else + signalPassFailure(); + } + +private: + std::string translateModuleToLLVM(llvm::Module &module, + llvm::TargetMachine &target_machine); + + /// Converts llvmModule to hsaco using the user-provided generator. Location + /// is used for error reporting and name is forwarded to the HSACO generator + /// to use in its logging mechanisms. + OwnedHsaco convertModuleToHsaco(llvm::Module &llvmModule, Location loc, + StringRef name); + + /// Translates llvmModule to hsaco and returns the result as attribute. + StringAttr translateGPUModuleToHsacoAnnotation(llvm::Module &llvmModule, + Location loc, StringRef name); + + HsacoGenerator hsacoGenerator; +}; + +} // anonymous namespace + +std::string GpuKernelToHsacoPass::translateModuleToLLVM( + llvm::Module &module, llvm::TargetMachine &target_machine) { + std::string llvmir; + { + // Clone the llvm module into a new context to enable concurrent compilation + // with multiple threads. 
+ llvm::LLVMContext llvmContext; + auto clone = LLVM::cloneModuleIntoNewContext(&llvmContext, &module); + + llvm::raw_string_ostream stream(llvmir); + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegen_passes; + target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr, + llvm::CGFT_AssemblyFile); + codegen_passes.run(*clone); + } + + return llvmir; +} + +OwnedHsaco GpuKernelToHsacoPass::convertModuleToHsaco(llvm::Module &llvmModule, + Location loc, + StringRef name) { + std::unique_ptr<llvm::TargetMachine> targetMachine; + { + std::string error; + constexpr const char *rocmTriple = "amdgcn-amd-amdhsa"; + llvm::Triple triple(rocmTriple); + const llvm::Target *target = + llvm::TargetRegistry::lookupTarget("", triple, error); + if (target == nullptr) { + emitError(loc, "cannot initialize target triple"); + return {}; + } + // TODO(whchung): be able to set mcpu. + targetMachine.reset( + target->createTargetMachine(triple.str(), "gfx900", "", {}, {})); + } + + // Set the data layout of the llvm module to what the AMDGPU target needs. 
+ llvmModule.setDataLayout(targetMachine->createDataLayout()); + + auto ptx = translateModuleToLLVM(llvmModule, *targetMachine); + + return hsacoGenerator(ptx, loc, name); +} + +StringAttr GpuKernelToHsacoPass::translateGPUModuleToHsacoAnnotation( + llvm::Module &llvmModule, Location loc, StringRef name) { + auto hsaco = convertModuleToHsaco(llvmModule, loc, name); + if (!hsaco) + return {}; + return StringAttr::get({hsaco->data(), hsaco->size()}, loc->getContext()); +} + +std::unique_ptr<OperationPass<gpu::GPUModuleOp>> +mlir::createConvertGPUKernelToHsacoPass(HsacoGenerator hsacoGenerator) { + return std::make_unique<GpuKernelToHsacoPass>(hsacoGenerator); +} diff --git a/mlir/test/Conversion/GPUToROCm/lit.local.cfg b/mlir/test/Conversion/GPUToROCm/lit.local.cfg new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUToROCm/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.run_rocm_tests: + config.unsupported = True diff --git a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir @@ -0,0 +1,26 @@ +// RUN: mlir-opt %s --test-kernel-to-hsaco -split-input-file | FileCheck %s + +// CHECK: attributes {rocdl.hsaco = "HSACO"} +gpu.module @foo { + llvm.func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">) + // CHECK: attributes {gpu.kernel} + attributes { gpu.kernel } { + llvm.return + } +} + +// ----- + +gpu.module @bar { + // CHECK: func @kernel_a + llvm.func @kernel_a() + attributes { gpu.kernel } { + llvm.return + } + + // CHECK: func @kernel_b + llvm.func @kernel_b() + attributes { gpu.kernel } { + llvm.return + } +} diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -5,6 +5,7 @@ TestCallGraph.cpp TestConstantFold.cpp TestConvertGPUKernelToCubin.cpp + TestConvertGPUKernelToHsaco.cpp 
TestDominance.cpp TestLoopFusion.cpp TestGpuMemoryPromotion.cpp @@ -37,6 +38,7 @@ MLIREDSC MLIRGPU MLIRGPUtoCUDATransforms + MLIRGPUtoROCmTransforms MLIRLinalgOps MLIRLinalgTransforms MLIRSCF diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp @@ -0,0 +1,31 @@ +//===- TestConvertGPUKernelToHsaco.cpp - Test gpu kernel hsaco lowering ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/GPUToROCm/GPUToROCmPass.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +using namespace mlir; + +#if MLIR_ROCM_CONVERSIONS_ENABLED +static OwnedHsaco compileROCDLToHsacoForTesting(const std::string &, Location, + StringRef) { + const char data[] = "HSACO"; + return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1); +} + +namespace mlir { +void registerTestConvertGPUKernelToHsacoPass() { + PassPipelineRegistration<>("test-kernel-to-hsaco", + "Convert all kernel functions to ROCm HSACO blobs", + [](OpPassManager &pm) { + pm.addPass(createConvertGPUKernelToHsacoPass( + compileROCDLToHsacoForTesting)); + }); +} +} // namespace mlir +#endif diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -38,6 +38,7 @@ config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@ config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@" config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@ +config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@ config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@" 
config.enable_vulkan_runner = @MLIR_VULKAN_RUNNER_ENABLED@ diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -46,6 +46,7 @@ void registerTestCallGraphPass(); void registerTestConstantFold(); void registerTestConvertGPUKernelToCubinPass(); +void registerTestConvertGPUKernelToHsacoPass(); void registerTestDominancePass(); void registerTestFunc(); void registerTestGpuMemoryPromotionPass(); @@ -111,6 +112,9 @@ registerTestConstantFold(); #if MLIR_CUDA_CONVERSIONS_ENABLED registerTestConvertGPUKernelToCubinPass(); +#endif +#if MLIR_ROCM_CONVERSIONS_ENABLED + registerTestConvertGPUKernelToHsacoPass(); #endif registerTestBufferPlacementPreparationPass(); registerTestDominancePass();