diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -31,6 +31,15 @@ # TODO: we should use a config.h file like LLVM does add_definitions(-DMLIR_CUDA_CONVERSIONS_ENABLED=${MLIR_CUDA_CONVERSIONS_ENABLED}) +# Build the ROCm conversions and run according tests if the AMDGPU backend +# is available +if ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + set(MLIR_ROCM_CONVERSIONS_ENABLED 1) +else() + set(MLIR_ROCM_CONVERSIONS_ENABLED 0) +endif() +add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ROCM_CONVERSIONS_ENABLED}) + set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner") set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner") diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h --- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h +++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h @@ -9,6 +9,7 @@ #define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ #include "mlir/Support/LLVM.h" +#include "llvm/IR/Module.h" #include #include #include @@ -17,11 +18,27 @@ namespace mlir { class Location; +class LogicalResult; class ModuleOp; +class Operation; template class OperationPass; +namespace gpu { +class GPUModuleOp; +} // namespace gpu + +namespace LLVM { +class LLVMDialect; +} // namespace LLVM + +using OwnedBlob = std::unique_ptr>; +using BlobGenerator = + std::function; +using LoweringCallback = + std::function(Operation *)>; + /// Creates a pass to convert a gpu.launch_func operation into a sequence of /// GPU runtime calls. /// @@ -31,6 +48,34 @@ std::unique_ptr> createConvertGpuLaunchFuncToGpuRuntimeCallsPass(); +/// Creates a pass to convert kernel functions into GPU target object blobs. 
+/// +/// This transformation runs on a gpu.module operation: it lowers the module +/// (with its 'gpu.kernel' functions) to an LLVM module via loweringCallback, +/// compiles that module with help of the GPU backend to target assembly and +/// invokes the provided blobGenerator to produce a binary blob. The blob is +/// then attached as a string attribute to the gpu.module operation. +/// +/// The following callbacks must be provided by the user: +/// - loweringCallback : lower the module to an LLVM module. +/// - blobGenerator : build a blob executable on target GPU. +/// +/// Information about the LLVM backend must be supplied by the user: +/// - triple : target triple to be used. +/// - targetChip : mcpu to be used. +/// - features : target-specific features to be used. +/// +/// Information about the result attribute must be specified by the user: +/// - gpuBinaryAnnotation : the name of the attribute which contains the blob. +/// +/// This pass attaches the blob only; it does not itself erase the bodies of +/// the kernel functions. +std::unique_ptr> +createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback, + BlobGenerator blobGenerator, StringRef triple, + StringRef targetChip, StringRef features, + StringRef gpuBinaryAnnotation); + } // namespace mlir #endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h deleted file mode 100644 --- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h +++ /dev/null @@ -1,50 +0,0 @@ -//===- GPUToCUDAPass.h - MLIR CUDA runtime support --------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_ -#define MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_ - -#include "mlir/Support/LLVM.h" -#include -#include -#include -#include - -namespace mlir { - -class Location; -class ModuleOp; - -template class OperationPass; - -namespace gpu { -class GPUModuleOp; -} // namespace gpu - -namespace LLVM { -class LLVMDialect; -} // namespace LLVM - -using OwnedCubin = std::unique_ptr>; -using CubinGenerator = - std::function; - -/// Creates a pass to convert kernel functions into CUBIN blobs. -/// -/// This transformation takes the body of each function that is annotated with -/// the 'nvvm.kernel' attribute, copies it to a new LLVM module, compiles the -/// module with help of the nvptx backend to PTX and then invokes the provided -/// cubinGenerator to produce a binary blob (the cubin). Such blob is then -/// attached as a string attribute named 'nvvm.cubin' to the kernel function. -/// After the transformation, the body of the kernel function is removed (i.e., -/// it is turned into a declaration). 
-std::unique_ptr> -createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator); - -} // namespace mlir - -#endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_ diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -16,7 +16,6 @@ #include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" -#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" #include "mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h" diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -1,7 +1,6 @@ add_subdirectory(AffineToStandard) add_subdirectory(AVX512ToLLVM) add_subdirectory(GPUCommon) -add_subdirectory(GPUToCUDA) add_subdirectory(GPUToNVVM) add_subdirectory(GPUToROCDL) add_subdirectory(GPUToSPIRV) diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -1,9 +1,6 @@ -set(SOURCES - ConvertLaunchFuncToRuntimeCalls.cpp -) - add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms - ${SOURCES} + ConvertLaunchFuncToRuntimeCalls.cpp + ConvertKernelFuncToBlob.cpp DEPENDS MLIRConversionPassIncGen 
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp b/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp
@@ -0,0 +1,183 @@
+//===- ConvertKernelFuncToBlob.cpp - MLIR GPU lowering passes -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to convert gpu kernel functions into a
+// corresponding binary blob that can be executed on a GPU. Currently
+// only translates the function itself but no dependencies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Function.h"
+#include "mlir/IR/Module.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "mlir/Support/LogicalResult.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace mlir;
+
+namespace {
+
+/// A pass converting a GPU module to a binary blob with target instructions.
+///
+/// The pass runs on a gpu.module operation. The module is lowered to an LLVM
+/// module by the user-provided LoweringCallback and compiled to target
+/// assembly by the LLVM backend selected through the target triple. A
+/// user-provided BlobGenerator then turns the assembly into a binary blob,
+/// which is attached to the gpu.module as a string attribute whose name is
+/// the user-provided annotation. This pass itself does not modify the kernel
+/// functions contained in the module.
+class GpuKernelToBlobPass
+    : public PassWrapper<GpuKernelToBlobPass, OperationPass<gpu::GPUModuleOp>> {
+public:
+  /// The target chip, features and annotation name are copied into the pass,
+  /// so the StringRef arguments only need to stay valid during this call.
+  GpuKernelToBlobPass(LoweringCallback loweringCallback,
+                      BlobGenerator blobGenerator, StringRef triple,
+                      StringRef targetChip, StringRef features,
+                      StringRef gpuBinaryAnnotation)
+      : loweringCallback(loweringCallback), blobGenerator(blobGenerator),
+        triple(triple), targetChip(targetChip.str()), features(features.str()),
+        blobAnnotation(gpuBinaryAnnotation.str()) {}
+
+  void runOnOperation() override {
+    gpu::GPUModuleOp module = getOperation();
+
+    // Lock access to the llvm context.
+    llvm::sys::SmartScopedLock<true> scopedLock(
+        module.getContext()
+            ->getRegisteredDialect<LLVM::LLVMDialect>()
+            ->getLLVMContextMutex());
+
+    // Lower the module to a llvm module.
+    std::unique_ptr<llvm::Module> llvmModule = loweringCallback(module);
+    if (!llvmModule)
+      return signalPassFailure();
+
+    // Translate the llvm module to a target blob and attach the result as
+    // attribute to the module.
+    if (auto blobAttr = translateGPUModuleToBinaryAnnotation(
+            *llvmModule, module.getLoc(), module.getName()))
+      module.setAttr(blobAnnotation, blobAttr);
+    else
+      signalPassFailure();
+  }
+
+private:
+  /// Compiles a clone of the given llvm module with targetMachine and returns
+  /// the emitted target assembly as a string.
+  std::string translateModuleToISA(llvm::Module &module,
+                                   llvm::TargetMachine &targetMachine);
+
+  /// Converts llvmModule to a blob with target instructions using the
+  /// user-provided generator. Location is used for error reporting and name is
+  /// forwarded to the blob generator to use in its logging mechanisms.
+  OwnedBlob convertModuleToBlob(llvm::Module &llvmModule, Location loc,
+                                StringRef name);
+
+  /// Translates llvmModule to a blob with target instructions and returns the
+  /// result as attribute.
+  StringAttr translateGPUModuleToBinaryAnnotation(llvm::Module &llvmModule,
+                                                  Location loc, StringRef name);
+
+  LoweringCallback loweringCallback;
+  BlobGenerator blobGenerator;
+  llvm::Triple triple;
+  // Owned copies: the pass may outlive the strings the constructor arguments
+  // referred to, so non-owning StringRef members could dangle.
+  std::string targetChip;
+  std::string features;
+  std::string blobAnnotation;
+};
+
+} // anonymous namespace
+
+std::string
+GpuKernelToBlobPass::translateModuleToISA(llvm::Module &module,
+                                          llvm::TargetMachine &targetMachine) {
+  std::string targetISA;
+  {
+    // Clone the llvm module into a new context to enable concurrent compilation
+    // with multiple threads.
+    llvm::LLVMContext llvmContext;
+    auto clone = LLVM::cloneModuleIntoNewContext(&llvmContext, &module);
+
+    llvm::raw_string_ostream stream(targetISA);
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegenPasses;
+    targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                      llvm::CGFT_AssemblyFile);
+    codegenPasses.run(*clone);
+  }
+
+  return targetISA;
+}
+
+OwnedBlob GpuKernelToBlobPass::convertModuleToBlob(llvm::Module &llvmModule,
+                                                   Location loc,
+                                                   StringRef name) {
+  std::unique_ptr<llvm::TargetMachine> targetMachine;
+  {
+    std::string error;
+    const llvm::Target *target =
+        llvm::TargetRegistry::lookupTarget("", triple, error);
+    if (target == nullptr) {
+      emitError(loc, "cannot initialize target triple");
+      return {};
+    }
+    targetMachine.reset(target->createTargetMachine(triple.str(), targetChip,
+                                                    features, {}, {}));
+    // createTargetMachine may return a null pointer; report that as an error
+    // instead of dereferencing it below.
+    if (!targetMachine) {
+      emitError(loc, "cannot initialize target machine");
+      return {};
+    }
+  }
+
+  llvmModule.setDataLayout(targetMachine->createDataLayout());
+
+  auto targetISA = translateModuleToISA(llvmModule, *targetMachine);
+
+  return blobGenerator(targetISA, loc, name);
+}
+
+StringAttr GpuKernelToBlobPass::translateGPUModuleToBinaryAnnotation(
+    llvm::Module &llvmModule, Location loc, StringRef name) {
+  auto blob = convertModuleToBlob(llvmModule, loc, name);
+  if (!blob)
+    return {};
+  return StringAttr::get({blob->data(), blob->size()}, loc->getContext());
+}
+
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+mlir::createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback,
+                                       BlobGenerator blobGenerator,
+                                       StringRef triple, StringRef targetChip,
+                                       StringRef features,
+                                       StringRef gpuBinaryAnnotation) {
+  return std::make_unique<GpuKernelToBlobPass>(loweringCallback, blobGenerator,
+                                               triple, targetChip, features,
+                                               gpuBinaryAnnotation);
+}
diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt deleted file mode 100644 --- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -set(LLVM_OPTIONAL_SOURCES - ConvertKernelFuncToCubin.cpp -) - -if (MLIR_CUDA_CONVERSIONS_ENABLED) - set(NVPTX_LIBS - MC - NVPTXCodeGen - NVPTXDesc - NVPTXInfo - ) - - add_mlir_conversion_library(MLIRGPUtoCUDATransforms - ConvertKernelFuncToCubin.cpp - - DEPENDS - MLIRConversionPassIncGen - intrinsics_gen - - LINK_COMPONENTS - Core - ${NVPTX_LIBS} - - LINK_LIBS PUBLIC - MLIRGPU - MLIRIR - MLIRLLVMIR - MLIRNVVMIR - MLIRPass - MLIRSupport - MLIRTargetNVVMIR - ) -else() - add_library(MLIRGPUtoCUDATransforms INTERFACE IMPORTED GLOBAL) -endif() diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp deleted file mode 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ /dev/null @@ -1,165 +0,0 @@ -//===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a pass to convert gpu kernel functions into a -// corresponding binary blob that can be executed on a CUDA GPU. Currently -// only translates the function itself but no dependencies. 
-// -//===----------------------------------------------------------------------===// - -#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" - -#include "mlir/Dialect/GPU/GPUDialect.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Module.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassRegistry.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Target/NVVMIR.h" - -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/Twine.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/Mutex.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" - -using namespace mlir; - -namespace { -// TODO(herhut): Move to shared location. -static constexpr const char *kCubinAnnotation = "nvvm.cubin"; - -/// A pass converting tagged kernel modules to cubin blobs. -/// -/// If tagged as a kernel module, each contained function is translated to NVVM -/// IR and further to PTX. A user provided CubinGenerator compiles the PTX to -/// GPU binary code, which is then attached as an attribute to the function. The -/// function body is erased. -class GpuKernelToCubinPass - : public PassWrapper> { -public: - GpuKernelToCubinPass(CubinGenerator cubinGenerator) - : cubinGenerator(cubinGenerator) {} - - void runOnOperation() override { - gpu::GPUModuleOp module = getOperation(); - - // Lock access to the llvm context. - llvm::sys::SmartScopedLock scopedLock( - module.getContext() - ->getRegisteredDialect() - ->getLLVMContextMutex()); - - // Make sure the NVPTX target is initialized. 
- LLVMInitializeNVPTXTarget(); - LLVMInitializeNVPTXTargetInfo(); - LLVMInitializeNVPTXTargetMC(); - LLVMInitializeNVPTXAsmPrinter(); - - auto llvmModule = translateModuleToNVVMIR(module); - if (!llvmModule) - return signalPassFailure(); - - // Translate the module to CUBIN and attach the result as attribute to the - // module. - if (auto cubinAttr = translateGPUModuleToCubinAnnotation( - *llvmModule, module.getLoc(), module.getName())) - module.setAttr(kCubinAnnotation, cubinAttr); - else - signalPassFailure(); - } - -private: - std::string translateModuleToPtx(llvm::Module &module, - llvm::TargetMachine &target_machine); - - /// Converts llvmModule to cubin using the user-provided generator. Location - /// is used for error reporting and name is forwarded to the CUBIN generator - /// to use in its logging mechanisms. - OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc, - StringRef name); - - /// Translates llvmModule to cubin and returns the result as attribute. - StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule, - Location loc, StringRef name); - - CubinGenerator cubinGenerator; -}; - -} // anonymous namespace - -std::string GpuKernelToCubinPass::translateModuleToPtx( - llvm::Module &module, llvm::TargetMachine &target_machine) { - std::string ptx; - { - // Clone the llvm module into a new context to enable concurrent compilation - // with multiple threads. - // TODO(zinenko): Reevaluate model of ownership of LLVMContext in - // LLVMDialect. 
- llvm::LLVMContext llvmContext; - auto clone = LLVM::cloneModuleIntoNewContext(&llvmContext, &module); - - llvm::raw_string_ostream stream(ptx); - llvm::buffer_ostream pstream(stream); - llvm::legacy::PassManager codegen_passes; - target_machine.addPassesToEmitFile(codegen_passes, pstream, nullptr, - llvm::CGFT_AssemblyFile); - codegen_passes.run(*clone); - } - - return ptx; -} - -OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule, - Location loc, - StringRef name) { - std::unique_ptr targetMachine; - { - std::string error; - // TODO(herhut): Make triple configurable. - constexpr const char *cudaTriple = "nvptx64-nvidia-cuda"; - llvm::Triple triple(cudaTriple); - const llvm::Target *target = - llvm::TargetRegistry::lookupTarget("", triple, error); - if (target == nullptr) { - emitError(loc, "cannot initialize target triple"); - return {}; - } - targetMachine.reset( - target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {})); - } - - // Set the data layout of the llvm module to match what the ptx target needs. 
- llvmModule.setDataLayout(targetMachine->createDataLayout()); - - auto ptx = translateModuleToPtx(llvmModule, *targetMachine); - - return cubinGenerator(ptx, loc, name); -} - -StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation( - llvm::Module &llvmModule, Location loc, StringRef name) { - auto cubin = convertModuleToCubin(llvmModule, loc, name); - if (!cubin) - return {}; - return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext()); -} - -std::unique_ptr> -mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) { - return std::make_unique(cubinGenerator); -} diff --git a/mlir/test/Conversion/GPUToROCm/lit.local.cfg b/mlir/test/Conversion/GPUToROCm/lit.local.cfg new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUToROCm/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.run_rocm_tests: + config.unsupported = True diff --git a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir @@ -0,0 +1,26 @@ +// RUN: mlir-opt %s --test-kernel-to-hsaco -split-input-file | FileCheck %s + +// CHECK: attributes {rocdl.hsaco = "HSACO"} +gpu.module @foo { + llvm.func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">) + // CHECK: attributes {gpu.kernel} + attributes { gpu.kernel } { + llvm.return + } +} + +// ----- + +gpu.module @bar { + // CHECK: func @kernel_a + llvm.func @kernel_a() + attributes { gpu.kernel } { + llvm.return + } + + // CHECK: func @kernel_b + llvm.func @kernel_b() + attributes { gpu.kernel } { + llvm.return + } +} diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,3 +1,21 @@ +if (MLIR_CUDA_CONVERSIONS_ENABLED) + set(NVPTX_LIBS + MC + NVPTXCodeGen + NVPTXDesc + NVPTXInfo + ) +endif() + +if 
(MLIR_ROCM_CONVERSIONS_ENABLED) + set(AMDGPU_LIBS + MC + AMDGPUCodeGen + AMDGPUDesc + AMDGPUInfo + ) +endif() + # Exclude tests from libMLIR.so add_mlir_library(MLIRTestTransforms TestAllReduceLowering.cpp @@ -5,6 +23,7 @@ TestCallGraph.cpp TestConstantFold.cpp TestConvertGPUKernelToCubin.cpp + TestConvertGPUKernelToHsaco.cpp TestDominance.cpp TestLoopFusion.cpp TestGpuMemoryPromotion.cpp @@ -31,18 +50,26 @@ MLIRStandardOpsIncGen MLIRTestVectorTransformPatternsIncGen + LINK_COMPONENTS + ${AMDGPU_LIBS} + ${NVPTX_LIBS} + LINK_LIBS PUBLIC MLIRAffineOps MLIRAnalysis MLIREDSC MLIRGPU - MLIRGPUtoCUDATransforms + MLIRGPUtoGPURuntimeTransforms MLIRLinalgOps MLIRLinalgTransforms + MLIRNVVMIR MLIRSCF MLIRGPU MLIRPass + MLIRROCDLIR MLIRStandardOpsTransforms + MLIRTargetNVVMIR + MLIRTargetROCDLIR MLIRTestDialect MLIRTransformUtils MLIRVectorToSCF diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp --- a/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp +++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp @@ -6,26 +6,36 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" +#include "mlir/Target/NVVMIR.h" +#include "llvm/Support/TargetSelect.h" using namespace mlir; #if MLIR_CUDA_CONVERSIONS_ENABLED -static OwnedCubin compilePtxToCubinForTesting(const std::string &, Location, - StringRef) { +static OwnedBlob compilePtxToCubinForTesting(const std::string &, Location, + StringRef) { const char data[] = "CUBIN"; return std::make_unique>(data, data + sizeof(data) - 1); } namespace mlir { void registerTestConvertGPUKernelToCubinPass() { - PassPipelineRegistration<>("test-kernel-to-cubin", - "Convert all kernel functions to CUDA cubin blobs", - [](OpPassManager &pm) { - 
pm.addPass(createConvertGPUKernelToCubinPass( - compilePtxToCubinForTesting)); - }); + PassPipelineRegistration<>( + "test-kernel-to-cubin", + "Convert all kernel functions to CUDA cubin blobs", + [](OpPassManager &pm) { + // Initialize LLVM NVPTX backend. + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + + pm.addPass(createConvertGPUKernelToBlobPass( + translateModuleToNVVMIR, compilePtxToCubinForTesting, + "nvptx64-nvidia-cuda", "sm_35", "+ptx60", "nvvm.cubin")); + }); } } // namespace mlir #endif diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp @@ -0,0 +1,41 @@ +//===- TestConvertGPUKernelToHsaco.cpp - Test gpu kernel hsaco lowering ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Target/ROCDLIR.h" +#include "llvm/Support/TargetSelect.h" +using namespace mlir; + +#if MLIR_ROCM_CONVERSIONS_ENABLED +static OwnedBlob compileIsaToHsacoForTesting(const std::string &, Location, + StringRef) { + const char data[] = "HSACO"; + return std::make_unique>(data, data + sizeof(data) - 1); +} + +namespace mlir { +void registerTestConvertGPUKernelToHsacoPass() { + PassPipelineRegistration<>( + "test-kernel-to-hsaco", + "Convert all kernel functions to ROCm hsaco blobs", + [](OpPassManager &pm) { + // Initialize LLVM AMDGPU backend. 
+ LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + + pm.addPass(createConvertGPUKernelToBlobPass( + translateModuleToROCDLIR, compileIsaToHsacoForTesting, + "amdgcn-amd-amdhsa", "gfx900", "-code-object-v3", "rocdl.hsaco")); + }); +} +} // namespace mlir +#endif diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -38,6 +38,7 @@ config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@ config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@" config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@ +config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@ config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@" config.enable_vulkan_runner = @MLIR_VULKAN_RUNNER_ENABLED@ diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/STLExtras.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" -#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" @@ -30,6 +29,7 @@ #include "mlir/InitAllDialects.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" +#include "mlir/Target/NVVMIR.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" #include "llvm/Support/InitLLVM.h" @@ -57,8 +57,8 @@ } \ } -OwnedCubin compilePtxToCubin(const std::string ptx, Location loc, - StringRef name) { +OwnedBlob compilePtxToCubin(const std::string ptx, Location loc, + StringRef name) { char jitErrorBuffer[4096] = {0}; RETURN_ON_CUDA_ERROR(cuInit(0), "cuInit"); @@ -97,7 
+97,7 @@ "cuLinkComplete"); char *cubinAsChar = static_cast(cubinData); - OwnedCubin result = + OwnedBlob result = std::make_unique>(cubinAsChar, cubinAsChar + cubinSize); // This will also destroy the cubin data. @@ -114,7 +114,9 @@ auto &kernelPm = pm.nest(); kernelPm.addPass(createStripDebugInfoPass()); kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); - kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin)); + kernelPm.addPass(createConvertGPUKernelToBlobPass( + translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda", + "sm_35", "+ptx60", "nvvm.cubin")); pm.addPass(createLowerToLLVMPass()); pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass()); @@ -127,6 +129,13 @@ llvm::InitLLVM y(argc, argv); llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); + + // Initialize LLVM NVPTX backend. + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + mlir::initializeLLVMPasses(); return mlir::JitRunnerMain(argc, argv, &runMLIRPasses); } diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -46,6 +46,7 @@ void registerTestCallGraphPass(); void registerTestConstantFold(); void registerTestConvertGPUKernelToCubinPass(); +void registerTestConvertGPUKernelToHsacoPass(); void registerTestDominancePass(); void registerTestFunc(); void registerTestGpuMemoryPromotionPass(); @@ -112,6 +113,9 @@ registerTestConstantFold(); #if MLIR_CUDA_CONVERSIONS_ENABLED registerTestConvertGPUKernelToCubinPass(); +#endif +#if MLIR_ROCM_CONVERSIONS_ENABLED + registerTestConvertGPUKernelToHsacoPass(); #endif registerTestBufferPlacementPreparationPass(); registerTestDominancePass();