diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -41,6 +41,7 @@
 add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ROCM_CONVERSIONS_ENABLED})
 
 set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner")
+set(MLIR_ROCM_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir ROCm runner")
 set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner")
 
 option(MLIR_INCLUDE_TESTS
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -43,7 +43,8 @@
 /// instead uses a small wrapper library that exports a stable and conveniently
 /// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
 std::unique_ptr<OperationPass<ModuleOp>>
-createConvertGpuLaunchFuncToGpuRuntimeCallsPass();
+createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
+    StringRef gpuBinaryAnnotation = "");
 
 /// Creates a pass to convert kernel functions into GPU target object blobs.
 ///
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -123,6 +123,11 @@
   void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
 
 public:
+  GpuLaunchFuncToGpuRuntimeCallsPass() = default;
+  GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
+    this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
+  }
+
   // Run the dialect converter on the module.
   void runOnOperation() override {
     // Cache the LLVMDialect for the current module.
@@ -457,6 +462,8 @@
 }
 
 std::unique_ptr<OperationPass<ModuleOp>>
-mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() {
-  return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
+mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
+    StringRef gpuBinaryAnnotation) {
+  return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
+      gpuBinaryAnnotation);
 }
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -13,8 +13,9 @@
 set(MLIR_RUNNER_UTILS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
 # Passed to lit.site.cfg.py.in to set up the path where to find the libraries
-# for the mlir cuda runner tests.
+# for the mlir cuda / rocm / vulkan runner tests.
 set(MLIR_CUDA_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+set(MLIR_ROCM_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 set(MLIR_VULKAN_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 
 configure_lit_site_cfg(
@@ -64,6 +65,12 @@
   )
 endif()
 
+if(MLIR_ROCM_RUNNER_ENABLED)
+  list(APPEND MLIR_TEST_DEPENDS
+    mlir-rocm-runner
+  )
+endif()
+
 if(MLIR_VULKAN_RUNNER_ENABLED)
   list(APPEND MLIR_TEST_DEPENDS
     mlir-vulkan-runner
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -70,6 +70,7 @@
     ToolSubst('%cuda_wrapper_library_dir', config.cuda_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
     ToolSubst('%mlir_runner_utils_dir', config.mlir_runner_utils_dir, unresolved='ignore'),
+    ToolSubst('%rocm_wrapper_library_dir', config.rocm_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%vulkan_wrapper_library_dir', config.vulkan_wrapper_library_dir, unresolved='ignore')
 ])
 
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -39,6 +39,8 @@
 config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@"
 config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
 config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
+config.rocm_wrapper_library_dir = "@MLIR_ROCM_WRAPPER_LIBRARY_DIR@"
+config.enable_rocm_runner = @MLIR_ROCM_RUNNER_ENABLED@
 config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@"
 config.enable_vulkan_runner = @MLIR_VULKAN_RUNNER_ENABLED@
diff --git a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir b/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir
@@ -0,0 +1,32 @@
+// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+
+func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
+  %cst = constant 1 : index
+  %cst2 = dim %arg1, 0 : memref<?xf32>
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
+             threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
+    store %arg0, %arg1[%tx] : memref<?xf32>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK: [1, 1, 1, 1, 1]
+func @main() {
+  %arg0 = alloc() : memref<5xf32>
+  %21 = constant 5 : i32
+  %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
+  %cast = memref_cast %22 : memref<?xf32> to memref<*xf32>
+  call @mgpuMemHostRegisterFloat(%cast) : (memref<*xf32>) -> ()
+  %23 = memref_cast %22 : memref<?xf32> to memref<*xf32>
+  call @print_memref_f32(%23) : (memref<*xf32>) -> ()
+  %24 = constant 1.0 : f32
+  %25 = call @mgpuMemGetDeviceMemRef1dFloat(%22) : (memref<?xf32>) -> (memref<?xf32>)
+  call @other_func(%24, %25) : (f32, memref<?xf32>) -> ()
+  call @print_memref_f32(%23) : (memref<*xf32>) -> ()
+  return
+}
+
+func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>)
+func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
+func @print_memref_f32(%ptr : memref<*xf32>)
diff --git a/mlir/test/mlir-rocm-runner/lit.local.cfg b/mlir/test/mlir-rocm-runner/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-rocm-runner/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.enable_rocm_runner:
+  config.unsupported = True
diff --git a/mlir/test/mlir-rocm-runner/two-modules.mlir b/mlir/test/mlir-rocm-runner/two-modules.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-rocm-runner/two-modules.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-rocm-runner %s --print-ir-after-all --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s --dump-input=always
+
+// CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+func @main() {
+  %arg = alloc() : memref<13xi32>
+  %dst = memref_cast %arg : memref<13xi32> to memref<?xi32>
+  %one = constant 1 : index
+  %sx = dim %dst, 0 : memref<?xi32>
+  %cast_dst = memref_cast %dst : memref<?xi32> to memref<*xi32>
+  call @mgpuMemHostRegisterInt32(%cast_dst) : (memref<*xi32>) -> ()
+  %dst_device = call @mgpuMemGetDeviceMemRef1dInt32(%dst) : (memref<?xi32>) -> (memref<?xi32>)
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
+    %t0 = index_cast %tx : index to i32
+    store %t0, %dst_device[%tx] : memref<?xi32>
+    gpu.terminator
+  }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
+    %t0 = index_cast %tx : index to i32
+    store %t0, %dst_device[%tx] : memref<?xi32>
+    gpu.terminator
+  }
+  call @print_memref_i32(%cast_dst) : (memref<*xi32>) -> ()
+  return
+}
+
+func @mgpuMemHostRegisterInt32(%ptr : memref<*xi32>)
+func @mgpuMemGetDeviceMemRef1dInt32(%ptr : memref<?xi32>) -> (memref<?xi32>)
+func @print_memref_i32(%ptr : memref<*xi32>)
diff --git a/mlir/test/mlir-rocm-runner/vecadd.mlir b/mlir/test/mlir-rocm-runner/vecadd.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-rocm-runner/vecadd.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-rocm-runner %s --shared-libs=%rocm_wrapper_library_dir/librocm-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+
+func @vecadd(%arg0 : memref<?xf32>, %arg1 : memref<?xf32>, %arg2 : memref<?xf32>) {
+  %cst = constant 1 : index
+  %cst2 = dim %arg0, 0 : memref<?xf32>
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
+             threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
+    %a = load %arg0[%tx] : memref<?xf32>
+    %b = load %arg1[%tx] : memref<?xf32>
+    %c = addf %a, %b : f32
+    store %c, %arg2[%tx] : memref<?xf32>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK: [2.46, 2.46, 2.46, 2.46, 2.46]
+func @main() {
+  %0 = alloc() : memref<5xf32>
+  %1 = alloc() : memref<5xf32>
+  %2 = alloc() : memref<5xf32>
+  %3 = memref_cast %0 : memref<5xf32> to memref<?xf32>
+  %4 = memref_cast %1 : memref<5xf32> to memref<?xf32>
+  %5 = memref_cast %2 : memref<5xf32> to memref<?xf32>
+  %6 = memref_cast %3 : memref<?xf32> to memref<*xf32>
+  %7 = memref_cast %4 : memref<?xf32> to memref<*xf32>
+  %8 = memref_cast %5 : memref<?xf32> to memref<*xf32>
+  call @mgpuMemHostRegisterFloat(%6) : (memref<*xf32>) -> ()
+  call @mgpuMemHostRegisterFloat(%7) : (memref<*xf32>) -> ()
+  call @mgpuMemHostRegisterFloat(%8) : (memref<*xf32>) -> ()
+  %9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref<?xf32>) -> (memref<?xf32>)
+  %10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref<?xf32>) -> (memref<?xf32>)
+  %11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref<?xf32>) -> (memref<?xf32>)
+
+  call @vecadd(%9, %10, %11) : (memref<?xf32>, memref<?xf32>, memref<?xf32>) -> ()
+  call @print_memref_f32(%8) : (memref<*xf32>) -> ()
+  return
+}
+
+func @mgpuMemHostRegisterFloat(%ptr : memref<*xf32>)
+func @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
+func @print_memref_f32(%ptr : memref<*xf32>)
diff --git a/mlir/tools/CMakeLists.txt b/mlir/tools/CMakeLists.txt
--- a/mlir/tools/CMakeLists.txt
+++ b/mlir/tools/CMakeLists.txt
@@ -2,6 +2,7 @@
 add_subdirectory(mlir-cpu-runner)
 add_subdirectory(mlir-linalg-ods-gen)
 add_subdirectory(mlir-opt)
+add_subdirectory(mlir-rocm-runner)
 add_subdirectory(mlir-translate)
 add_subdirectory(mlir-vulkan-runner)
 add_subdirectory(mlir-shlib)
diff --git a/mlir/tools/mlir-rocm-runner/CMakeLists.txt b/mlir/tools/mlir-rocm-runner/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/mlir/tools/mlir-rocm-runner/CMakeLists.txt
@@ -0,0 +1,113 @@
+set(LLVM_OPTIONAL_SOURCES
+  rocm-runtime-wrappers.cpp
+  mlir-rocm-runner.cpp
+  )
+
+if(MLIR_ROCM_RUNNER_ENABLED)
+  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
+    message(SEND_ERROR
+      "Building the mlir rocm runner requires the AMDGPU backend")
+  endif()
+
+  # Ensure lld is enabled.
+  if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS)
+    message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS")
+  endif()
+
+  # lld header files.
+  include_directories(${MLIR_SOURCE_DIR}/../lld/include)
+
+  # Configure ROCm support.
+  if (NOT DEFINED ROCM_PATH)
+    if (NOT DEFINED ENV{ROCM_PATH})
+      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
+    else()
+      set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
+    endif()
+    set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which HIP has been installed")
+  endif()
+  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
+  find_package(HIP)
+  if (NOT HIP_FOUND)
+    message(SEND_ERROR "Building the mlir rocm runner requires a working ROCm and HIP install")
+  else()
+    message(STATUS "ROCm HIP version: ${HIP_VERSION}")
+  endif()
+
+  # Locate HIP runtime library.
+  find_library(ROCM_RUNTIME_LIBRARY hip_hcc
+    PATHS "${HIP_PATH}/lib")
+  if (NOT ROCM_RUNTIME_LIBRARY)
+    message(SEND_ERROR "Could not locate ROCm HIP runtime library")
+  else()
+    message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}")
+  endif()
+
+  # Set HIP compile-time flags.
+  add_definitions(-D__HIP_PLATFORM_HCC__)
+
+  add_llvm_library(rocm-runtime-wrappers SHARED
+    rocm-runtime-wrappers.cpp
+  )
+  target_include_directories(rocm-runtime-wrappers
+    PRIVATE
+    "${HIP_PATH}/../include"
+    "${HIP_PATH}/include"
+    LLVMSupport
+  )
+  target_link_libraries(rocm-runtime-wrappers
+    PUBLIC
+    LLVMSupport
+    ${ROCM_RUNTIME_LIBRARY}
+  )
+
+  get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+  get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+  set(LIBS
+    ${dialect_libs}
+    ${conversion_libs}
+    lldCommon
+    lldDriver
+    lldELF
+    LLVMCore
+    LLVMLTO
+    LLVMMC
+    LLVMMCParser
+    LLVMOption
+    LLVMSupport
+    MLIRJitRunner
+    MLIRAnalysis
+    MLIREDSC
+    MLIRExecutionEngine
+    MLIRIR
+    MLIRParser
+    MLIRROCDLIR
+    MLIRSupport
+    MLIRTargetLLVMIR
+    MLIRTargetROCDLIR
+    MLIRTransforms
+    MLIRTranslation
+    ${ROCM_RUNTIME_LIBRARY}
+  )
+
+  # Manually expand the target library, since our MLIR libraries
+  # aren't plugged into the LLVM dependency tracking. If we don't
+  # do this then we can't insert the CodeGen library after ourselves.
+  llvm_expand_pseudo_components(TARGET_LIBS AllTargetsCodeGens AllTargetsAsmParsers)
+  # Prepend LLVM in front of every target, this is how the libraries
+  # are named with CMake.
+  SET(targets_to_link)
+  FOREACH(t ${TARGET_LIBS})
+    LIST(APPEND targets_to_link "LLVM${t}")
+  ENDFOREACH(t)
+
+  add_llvm_tool(mlir-rocm-runner
+    mlir-rocm-runner.cpp
+
+    DEPENDS
+    rocm-runtime-wrappers
+  )
+  llvm_update_compile_flags(mlir-rocm-runner)
+  target_link_libraries(mlir-rocm-runner PRIVATE ${LIBS} ${targets_to_link})
+
+endif()
diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp
@@ -0,0 +1,248 @@
+//===- mlir-rocm-runner.cpp - MLIR ROCM Execution Driver ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a command line utility that executes an MLIR file on the GPU by
+// translating MLIR to ROCDL/LLVM IR before JIT-compiling and executing the
+// latter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/ExecutionEngine/JitRunner.h"
+#include "mlir/ExecutionEngine/OptUtils.h"
+#include "mlir/IR/Function.h"
+#include "mlir/IR/Module.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Target/ROCDLIR.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+// MC headers.
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+
+// lld headers.
+#include "lld/Common/Driver.h"
+
+using namespace mlir;
+using namespace llvm;
+
+using Blob = SmallVector<char, 0>;
+
+static cl::opt<std::string> tripleName("triple", cl::desc("target triple"),
+                                       cl::value_desc("triple string"),
+                                       cl::init("amdgcn-amd-amdhsa"));
+
+// TODO(whchung): Add a feature to automatically detect the available AMD GCN
+// ISA version via the `rocm-agent-enumerator` utility.
+static cl::opt<std::string> targetChip("target", cl::desc("target chip"),
+                                       cl::value_desc("AMDGPU ISA version"),
+                                       cl::init("gfx900"));
+
+static cl::opt<std::string> features("feature", cl::desc("target features"),
+                                     cl::value_desc("AMDGPU target features"),
+                                     cl::init("-code-object-v3"));
+
+static LogicalResult assembleIsa(const std::string isa, StringRef name,
+                                 Blob &result) {
+  raw_svector_ostream os(result);
+
+  std::string error;
+  Triple theTriple(Triple::normalize(tripleName));
+  const Target *theTarget =
+      TargetRegistry::lookupTarget(theTriple.normalize(), error);
+  if (!theTarget) {
+    WithColor::error(errs(), name) << error;
+    return failure();
+  }
+
+  SourceMgr srcMgr;
+  srcMgr.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(isa), SMLoc());
+
+  const MCTargetOptions mcOptions;
+  std::unique_ptr<MCRegisterInfo> mri(theTarget->createMCRegInfo(tripleName));
+  std::unique_ptr<MCAsmInfo> mai(
+      theTarget->createMCAsmInfo(*mri, tripleName, mcOptions));
+  mai->setRelaxELFRelocations(true);
+
+  MCObjectFileInfo mofi;
+  MCContext ctx(mai.get(), mri.get(), &mofi, &srcMgr, &mcOptions);
+  mofi.InitMCObjectFileInfo(theTriple, false, ctx, false);
+
+  SmallString<128> cwd;
+  if (!sys::fs::current_path(cwd))
+    ctx.setCompilationDir(cwd);
+
+  std::unique_ptr<MCStreamer> mcStreamer;
+  std::unique_ptr<MCInstrInfo> mcii(theTarget->createMCInstrInfo());
+  std::unique_ptr<MCSubtargetInfo> sti(
+      theTarget->createMCSubtargetInfo(tripleName, targetChip, features));
+
+  MCCodeEmitter *ce = theTarget->createMCCodeEmitter(*mcii, *mri, ctx);
+  MCAsmBackend *mab = theTarget->createMCAsmBackend(*sti, *mri, mcOptions);
+  mcStreamer.reset(theTarget->createMCObjectStreamer(
+      theTriple, ctx, std::unique_ptr<MCAsmBackend>(mab),
+      mab->createObjectWriter(os), std::unique_ptr<MCCodeEmitter>(ce), *sti,
+      mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
+      /*DWARFMustBeAtTheEnd*/ false));
+  mcStreamer->setUseAssemblerInfoForParsing(true);
+
+  std::unique_ptr<MCAsmParser> parser(
+      createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
+  std::unique_ptr<MCTargetAsmParser> tap(
+      theTarget->createMCAsmParser(*sti, *parser, *mcii, mcOptions));
+
+  if (!tap) {
+    WithColor::error(errs(), name) << "assembler initialization error.\n";
+    return failure();
+  }
+
+  parser->setTargetParser(*tap);
+  parser->Run(false);
+
+  return success();
+}
+
+static LogicalResult createHsaco(const Blob &isaBlob, StringRef name,
+                                 Blob &hsacoBlob) {
+  // Save the ISA binary to a temp file.
+  int tempIsaBinaryFd = -1;
+  SmallString<128> tempIsaBinaryFilename;
+  std::error_code ec = sys::fs::createTemporaryFile(
+      "kernel", "o", tempIsaBinaryFd, tempIsaBinaryFilename);
+  if (ec) {
+    WithColor::error(errs(), name)
+        << "temporary file for ISA binary creation error.\n";
+    return failure();
+  }
+  FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
+  raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
+  tempIsaBinaryOs << isaBlob;
+  tempIsaBinaryOs.close();
+
+  // Create a temp file for HSA code object.
+  int tempHsacoFD = -1;
+  SmallString<128> tempHsacoFilename;
+  ec = sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD,
+                                    tempHsacoFilename);
+  if (ec) {
+    WithColor::error(errs(), name)
+        << "temporary file for HSA code object creation error.\n";
+    return failure();
+  }
+  FileRemover cleanupHsaco(tempHsacoFilename);
+
+  // Invoke lld. Expect a true return value from lld.
+  bool ret = lld::elf::link({"ld.lld", "-shared", tempIsaBinaryFilename.c_str(),
+                             "-o", tempHsacoFilename.c_str()},
+                            /*canEarlyExit=*/false, llvm::outs(), llvm::errs());
+  if (!ret) {
+    WithColor::error(errs(), name) << "lld invocation error.\n";
+    return failure();
+  }
+
+  // Load the HSA code object.
+  auto hsacoFile = mlir::openInputFile(tempHsacoFilename);
+  if (!hsacoFile) {
+    WithColor::error(errs(), name)
+        << "read HSA code object from temp file error.\n";
+    return failure();
+  }
+  hsacoBlob.assign(hsacoFile->getBuffer().begin(),
+                   hsacoFile->getBuffer().end());
+
+  return success();
+}
+
+static std::unique_ptr<llvm::Module> compileModuleToROCDLIR(Operation *m) {
+  auto llvmModule = translateModuleToROCDLIR(m);
+  // TODO(whchung): Link with ROCm-Device-Libs if needed (e.g. when the Module
+  // depends on math functions).
+  return llvmModule;
+}
+
+static OwnedBlob compileISAToHsaco(const std::string isa, Location loc,
+                                   StringRef name) {
+  // ISA -> ISA in binary form via MC.
+  // Use lld to create HSA code object.
+  Blob isaBlob;
+  Blob hsacoBlob;
+
+  if (succeeded(assembleIsa(isa, name, isaBlob)) &&
+      succeeded(createHsaco(isaBlob, name, hsacoBlob)))
+    return std::make_unique<std::vector<char>>(hsacoBlob.begin(),
+                                               hsacoBlob.end());
+
+  WithColor::error(errs(), name) << "producing HSA code object error.\n";
+  return {};
+}
+
+static LogicalResult runMLIRPasses(ModuleOp m) {
+  PassManager pm(m.getContext());
+  applyPassManagerCLOptions(pm);
+
+  pm.addPass(createGpuKernelOutliningPass());
+  auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
+  kernelPm.addPass(createStripDebugInfoPass());
+  kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
+  kernelPm.addPass(createConvertGPUKernelToBlobPass(
+      compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
+      features, /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
+  pm.addPass(createLowerToLLVMPass());
+  pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
+      /*gpuBinaryAnnotation=*/"rocdl.hsaco"));
+
+  return pm.run(m);
+}
+
+int main(int argc, char **argv) {
+  registerPassManagerCLOptions();
+  mlir::registerAllDialects();
+  llvm::InitLLVM y(argc, argv);
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmParsers();
+
+  // Initialize LLVM AMDGPU backend.
+  LLVMInitializeAMDGPUTarget();
+  LLVMInitializeAMDGPUTargetInfo();
+  LLVMInitializeAMDGPUTargetMC();
+  LLVMInitializeAMDGPUAsmPrinter();
+
+  mlir::initializeLLVMPasses();
+  return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);
+}
diff --git a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp b/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp
@@ -0,0 +1,143 @@
+//===- rocm-runtime-wrappers.cpp - MLIR ROCM runner wrapper library -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements C wrappers around the ROCM library for easy linking in ORC JIT.
+// Also adds some debugging helpers that are helpful when writing MLIR code to
+// run on GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <numeric>
+
+#include "mlir/ExecutionEngine/CRunnerUtils.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "hip/hip_runtime.h"
+
+namespace {
+int32_t reportErrorIfAny(hipError_t result, const char *where) {
+  if (result != hipSuccess) {
+    llvm::errs() << "HIP failed with " << result << " in " << where << "\n";
+  }
+  return result;
+}
+} // anonymous namespace
+
+extern "C" int32_t mgpuModuleLoad(void **module, void *data) {
+  int32_t err = reportErrorIfAny(
+      hipModuleLoadData(reinterpret_cast<hipModule_t *>(module), data),
+      "ModuleLoad");
+  return err;
+}
+
+extern "C" int32_t mgpuModuleGetFunction(void **function, void *module,
+                                         const char *name) {
+  return reportErrorIfAny(
+      hipModuleGetFunction(reinterpret_cast<hipFunction_t *>(function),
+                           reinterpret_cast<hipModule_t>(module), name),
+      "GetFunction");
+}
+
+// The wrapper uses intptr_t instead of ROCM's unsigned int to match
+// the type of MLIR's index type. This avoids the need for casts in the
+// generated MLIR code.
+extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX,
+                                    intptr_t gridY, intptr_t gridZ,
+                                    intptr_t blockX, intptr_t blockY,
+                                    intptr_t blockZ, int32_t smem, void *stream,
+                                    void **params, void **extra) {
+  return reportErrorIfAny(
+      hipModuleLaunchKernel(reinterpret_cast<hipFunction_t>(function), gridX,
+                            gridY, gridZ, blockX, blockY, blockZ, smem,
+                            reinterpret_cast<hipStream_t>(stream), params,
+                            extra),
+      "LaunchKernel");
+}
+
+extern "C" void *mgpuGetStreamHelper() {
+  hipStream_t stream;
+  reportErrorIfAny(hipStreamCreate(&stream), "StreamCreate");
+  return stream;
+}
+
+extern "C" int32_t mgpuStreamSynchronize(void *stream) {
+  return reportErrorIfAny(
+      hipStreamSynchronize(reinterpret_cast<hipStream_t>(stream)),
+      "StreamSync");
+}
+
+/// Helper functions for writing mlir example code.
+
+// Allows registering a byte array with the ROCM runtime. Helpful until we have
+// transfer functions implemented.
+extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
+  reportErrorIfAny(hipHostRegister(ptr, sizeBytes, /*flags=*/0),
+                   "MemHostRegister");
+}
+
+// Allows registering a MemRef with the ROCM runtime. Initializes the array
+// with value. Helpful until we have transfer functions implemented.
+template <typename T>
+void mgpuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
+                               llvm::ArrayRef<int64_t> strides, T value) {
+  assert(sizes.size() == strides.size());
+  llvm::SmallVector<int64_t, 4> denseStrides(strides.size());
+
+  std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),
+                   std::multiplies<int64_t>());
+  auto count = denseStrides.front();
+
+  // Only densely packed tensors are currently supported.
+  std::rotate(denseStrides.begin(), denseStrides.begin() + 1,
+              denseStrides.end());
+  denseStrides.back() = 1;
+  assert(strides == llvm::makeArrayRef(denseStrides));
+
+  std::fill_n(pointer, count, value);
+  mgpuMemHostRegister(pointer, count * sizeof(T));
+}
+
+extern "C" void mgpuMemHostRegisterFloat(int64_t rank, void *ptr) {
+  auto *desc = static_cast<StridedMemRefType<float, 1> *>(ptr);
+  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
+  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
+  mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 1.23f);
+}
+
+extern "C" void mgpuMemHostRegisterInt32(int64_t rank, void *ptr) {
+  auto *desc = static_cast<StridedMemRefType<int32_t, 1> *>(ptr);
+  auto sizes = llvm::ArrayRef<int64_t>(desc->sizes, rank);
+  auto strides = llvm::ArrayRef<int64_t>(desc->sizes + rank, rank);
+  mgpuMemHostRegisterMemRef(desc->data + desc->offset, sizes, strides, 123);
+}
+
+template <typename T>
+void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) {
+  reportErrorIfAny(hipSetDevice(0), "hipSetDevice");
+  reportErrorIfAny(
+      hipHostGetDevicePointer((void **)devicePtr, hostPtr, /*flags=*/0),
+      "hipHostGetDevicePointer");
+}
+
+extern "C" StridedMemRefType<float, 1>
+mgpuMemGetDeviceMemRef1dFloat(float *allocated, float *aligned, int64_t offset,
+                              int64_t size, int64_t stride) {
+  float *devicePtr = nullptr;
+  mgpuMemGetDevicePointer(aligned, &devicePtr);
+  return {devicePtr, devicePtr, offset, {size}, {stride}};
+}
+
+extern "C" StridedMemRefType<int32_t, 1>
+mgpuMemGetDeviceMemRef1dInt32(int32_t *allocated, int32_t *aligned,
+                              int64_t offset, int64_t size, int64_t stride) {
+  int32_t *devicePtr = nullptr;
+  mgpuMemGetDevicePointer(aligned, &devicePtr);
+  return {devicePtr, devicePtr, offset, {size}, {stride}};
+}
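
For reference, below is a minimal host-side sketch of how the rocm-runtime-wrappers ABI introduced above is meant to be exercised; the MLIR lowered by mlir-rocm-runner emits equivalent calls. This example is not part of the patch: it assumes the rocm-runtime-wrappers library and a working HIP install are linked in, and it only uses the wrapper signatures defined in rocm-runtime-wrappers.cpp plus StridedMemRefType from CRunnerUtils.h.

// Hypothetical standalone example (not part of the patch).
#include <cstdint>
#include <vector>

#include "mlir/ExecutionEngine/CRunnerUtils.h"

// Declarations matching the extern "C" definitions in rocm-runtime-wrappers.cpp.
extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes);
extern "C" StridedMemRefType<float, 1>
mgpuMemGetDeviceMemRef1dFloat(float *allocated, float *aligned, int64_t offset,
                              int64_t size, int64_t stride);
extern "C" void *mgpuGetStreamHelper();
extern "C" int32_t mgpuStreamSynchronize(void *stream);

int main() {
  std::vector<float> host(5, 1.23f);
  // Pin the host buffer so the device can access it (no explicit copies yet).
  mgpuMemHostRegister(host.data(), host.size() * sizeof(float));
  // Build a device-visible 1-D memref descriptor over the same buffer.
  StridedMemRefType<float, 1> device = mgpuMemGetDeviceMemRef1dFloat(
      host.data(), host.data(), /*offset=*/0, /*size=*/5, /*stride=*/1);
  // A kernel launched through mgpuLaunchKernel would write via device.data;
  // synchronizing the stream then makes the results visible on the host.
  void *stream = mgpuGetStreamHelper();
  mgpuStreamSynchronize(stream);
  return device.data == nullptr ? 1 : 0;
}

The gpu-to-hsaco.mlir test above issues the same registration and device-memref calls from MLIR before launching the kernel.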