diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h --- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h +++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h @@ -19,12 +19,17 @@ class Location; class ModuleOp; +template +class OpPassBase; + +namespace gpu { +class GPUModuleOp; +} // namespace gpu + namespace LLVM { class LLVMDialect; } // namespace LLVM -template class OpPassBase; - using OwnedCubin = std::unique_ptr>; using CubinGenerator = std::function; @@ -38,7 +43,7 @@ /// attached as a string attribute named 'nvvm.cubin' to the kernel function. /// After the transformation, the body of the kernel function is removed (i.e., /// it is turned into a declaration). -std::unique_ptr> +std::unique_ptr> createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator); /// Creates a pass to convert a gpu.launch_func operation into a sequence of diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h --- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h +++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h @@ -14,15 +14,19 @@ class LLVMTypeConverter; class OwningRewritePatternList; -class ModuleOp; -template class OpPassBase; +template +class OpPassBase; + +namespace gpu { +class GPUModuleOp; +} /// Collect a set of patterns to convert from the GPU dialect to NVVM. void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, OwningRewritePatternList &patterns); /// Creates a pass that lowers GPU dialect operations to NVVM counterparts. -std::unique_ptr> createLowerGpuOpsToNVVMOpsPass(); +std::unique_ptr> createLowerGpuOpsToNVVMOpsPass(); } // namespace mlir diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h --- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -12,11 +12,13 @@ namespace mlir { -class ModuleOp; +namespace gpu { +class GPUModuleOp; +} // namespace gpu template class OpPassBase; /// Creates a pass that lowers GPU dialect operations to ROCDL counterparts. -std::unique_ptr> createLowerGpuOpsToROCDLOpsPass(); +std::unique_ptr> createLowerGpuOpsToROCDLOpsPass(); } // namespace mlir diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -588,4 +588,56 @@ let printer = [{ p << getOperationName(); }]; } +def GPU_GPUModuleOp : GPU_Op<"module", [ + IsolatedFromAbove, SymbolTable, Symbol, + SingleBlockImplicitTerminator<"ModuleEndOp"> +]> { + let summary = "A top level compilation unit containing code to be run on a GPU."; + let description = [{ + GPU module contains code that is intended to be run on a GPU. A host device + can launch this code through a gpu.launc_func that creates a fully + qualified symbol through the gpu.module's symbol and a gpu.func symbol + contained in the gpu.module. + + The module's top-level scope is modeled by a single region with a single + block. GPU modules are required to have a name that is used for symbol + resolution by the gpu.launch_func operation. + + Using an op with a region to define a GPU module enables "embedding" GPU + modules with SIMT execution models in other dialects in a clean manner and + allows filtering of code regions to execute passes on only code intended to + or not intended to be run on the separate device. + + ``` + gpu.module @symbol_name { + gpu.func {} + ... + gpu.module_end + } + + ``` + }]; + let builders = [OpBuilder<"Builder *builder, OperationState &result, " + "StringRef name">]; + let parser = [{ return ::parseGPUModuleOp(parser, result); }]; + let printer = [{ return ::print(p, *this); }]; + let regions = (region SizedRegion<1>:$body); + + // We need to ensure the block inside the region is properly terminated; + // the auto-generated builders do not guarantee that. + let skipDefaultBuilders = 1; +} + +def GPU_ModuleEndOp : GPU_Op<"module_end", [ + Terminator, HasParent<"GPUModuleOp"> +]> { + let summary = "A pseudo op that marks the end of a gpu.module."; + let description = [{ + This op terminates the only block inside the only region of a `gpu.module`. + }]; + + let parser = [{ return success(); }]; + let printer = [{ p << getOperationName(); }]; +} + #endif // GPU_OPS diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp @@ -46,18 +46,15 @@ /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to /// GPU binary code, which is then attached as an attribute to the function. The /// function body is erased. -class GpuKernelToCubinPass : public ModulePass { +class GpuKernelToCubinPass + : public OperationPass { public: GpuKernelToCubinPass( CubinGenerator cubinGenerator = compilePtxToCubinForTesting) : cubinGenerator(cubinGenerator) {} - void runOnModule() override { - ModuleOp module = getModule(); - if (!module.getAttrOfType( - gpu::GPUDialect::getKernelModuleAttrName()) || - !module.getName()) - return; + void runOnOperation() override { + gpu::GPUModuleOp module = getOperation(); // Make sure the NVPTX target is initialized. LLVMInitializeNVPTXTarget(); @@ -71,8 +68,8 @@ // Translate the module to CUBIN and attach the result as attribute to the // module. - if (auto cubinAttr = translateGpuModuleToCubinAnnotation( - *llvmModule, module.getLoc(), *module.getName())) + if (auto cubinAttr = translateGPUModuleToCubinAnnotation( + *llvmModule, module.getLoc(), module.getName())) module.setAttr(kCubinAnnotation, cubinAttr); else signalPassFailure(); @@ -92,7 +89,7 @@ StringRef name); /// Translates llvmModule to cubin and returns the result as attribute. - StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule, + StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule, Location loc, StringRef name); CubinGenerator cubinGenerator; @@ -149,7 +146,7 @@ return cubinGenerator(ptx, loc, name); } -StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation( +StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation( llvm::Module &llvmModule, Location loc, StringRef name) { auto cubin = convertModuleToCubin(llvmModule, loc, name); if (!cubin) @@ -157,7 +154,7 @@ return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext()); } -std::unique_ptr> +std::unique_ptr> mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) { return std::make_unique(cubinGenerator); } diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -132,9 +132,9 @@ // GPU kernel modules are no longer necessary since we have a global // constant with the CUBIN data. - for (auto m : llvm::make_early_inc_range(getModule().getOps())) - if (m.getAttrOfType(gpu::GPUDialect::getKernelModuleAttrName())) - m.erase(); + for (auto m : + llvm::make_early_inc_range(getModule().getOps())) + m.erase(); } private: @@ -343,8 +343,8 @@ builder.getI32IntegerAttr(0)); // Create an LLVM global with CUBIN extracted from the kernel annotation and // obtain a pointer to the first byte in it. - auto kernelModule = - getModule().lookupSymbol(launchOp.getKernelModuleName()); + auto kernelModule = getModule().lookupSymbol( + launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); auto cubinAttr = kernelModule.getAttrOfType(kCubinAnnotation); @@ -354,8 +354,7 @@ return signalPassFailure(); } - assert(kernelModule.getName() && "expected a named module"); - SmallString<128> nameBuffer(*kernelModule.getName()); + SmallString<128> nameBuffer(kernelModule.getName()); nameBuffer.append(kCubinStorageSuffix); Value data = LLVM::createGlobalString( loc, builder, nameBuffer.str(), cubinAttr.getValue(), diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -200,7 +200,7 @@ auto type = operand.getType().cast(); // Create shared memory array to store the warp reduction. - auto module = operand.getDefiningOp()->getParentOfType(); + auto module = operand.getDefiningOp()->getParentOfType(); assert(module && "op must belong to a module"); Value sharedMemPtr = createSharedMemoryArray(loc, module, type, kWarpSize, rewriter); @@ -391,10 +391,10 @@ } /// Creates a global array stored in shared memory. - Value createSharedMemoryArray(Location loc, ModuleOp module, + Value createSharedMemoryArray(Location loc, gpu::GPUModuleOp module, LLVM::LLVMType elementType, int numElements, ConversionPatternRewriter &rewriter) const { - OpBuilder builder(module.getBodyRegion()); + OpBuilder builder(module.body()); auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); StringRef name = "reduce_buffer"; @@ -699,13 +699,11 @@ /// /// This pass only handles device code and is not meant to be run on GPU host /// code. -class LowerGpuOpsToNVVMOpsPass : public ModulePass { +class LowerGpuOpsToNVVMOpsPass + : public OperationPass { public: - void runOnModule() override { - ModuleOp m = getModule(); - if (!m.getAttrOfType(gpu::GPUDialect::getKernelModuleAttrName())) - return; - + void runOnOperation() override { + gpu::GPUModuleOp m = getOperation(); OwningRewritePatternList patterns; NVVMTypeConverter converter(m.getContext()); populateStdToLLVMConversionPatterns(converter, patterns); @@ -718,7 +716,7 @@ target.addLegalDialect(); target.addLegalDialect(); // TODO(csigg): Remove once we support replacing non-root ops. - target.addLegalOp(); + target.addLegalOp(); if (failed(applyPartialConversion(m, target, patterns, &converter))) signalPassFailure(); } @@ -750,7 +748,8 @@ "__nv_exp"); } -std::unique_ptr> mlir::createLowerGpuOpsToNVVMOpsPass() { +std::unique_ptr> +mlir::createLowerGpuOpsToNVVMOpsPass() { return std::make_unique(); } diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -31,12 +31,11 @@ // // This pass only handles device code and is not meant to be run on GPU host // code. -class LowerGpuOpsToROCDLOpsPass : public ModulePass { +class LowerGpuOpsToROCDLOpsPass + : public OperationPass { public: - void runOnModule() override { - ModuleOp m = getModule(); - if (!m.getAttrOfType(gpu::GPUDialect::getKernelModuleAttrName())) - return; + void runOnOperation() override { + gpu::GPUModuleOp m = getOperation(); OwningRewritePatternList patterns; LLVMTypeConverter converter(m.getContext()); @@ -73,7 +72,8 @@ } // anonymous namespace -std::unique_ptr> mlir::createLowerGpuOpsToROCDLOpsPass() { +std::unique_ptr> +mlir::createLowerGpuOpsToROCDLOpsPass() { return std::make_unique(); } diff --git a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt --- a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt @@ -1,8 +1,15 @@ +set(LLVM_TARGET_DEFINITIONS GPUToSPIRV.td) +mlir_tablegen(GPUToSPIRV.cpp.inc -gen-rewriters) +add_public_tablegen_target(MLIRGPUToSPIRVIncGen) + add_llvm_library(MLIRGPUtoSPIRVTransforms ConvertGPUToSPIRV.cpp ConvertGPUToSPIRVPass.cpp ) +add_dependencies(MLIRGPUtoSPIRVTransforms + MLIRGPUToSPIRVIncGen) + target_link_libraries(MLIRGPUtoSPIRVTransforms MLIRGPU MLIRIR diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp @@ -63,27 +63,13 @@ SmallVector workGroupSizeAsInt32; }; -/// Pattern to convert a module with gpu.kernel_module attribute to a -/// spv.module. -class KernelModuleConversion final : public SPIRVOpLowering { +/// Pattern to convert a gpu.module to a spv.module. +class GPUModuleConversion final : public SPIRVOpLowering { public: - using SPIRVOpLowering::SPIRVOpLowering; + using SPIRVOpLowering::SPIRVOpLowering; PatternMatchResult - matchAndRewrite(ModuleOp moduleOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override; -}; - -/// Pattern to convert a module terminator op to a terminator of spv.module op. -// TODO: Move this into DRR, but that requires ModuleTerminatorOp to be defined -// in ODS. -class KernelModuleTerminatorConversion final - : public SPIRVOpLowering { -public: - using SPIRVOpLowering::SPIRVOpLowering; - - PatternMatchResult - matchAndRewrite(ModuleTerminatorOp terminatorOp, ArrayRef operands, + matchAndRewrite(gpu::GPUModuleOp moduleOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const override; }; @@ -284,16 +270,12 @@ } //===----------------------------------------------------------------------===// -// ModuleOp with gpu.kernel_module. +// ModuleOp with gpu.module. //===----------------------------------------------------------------------===// -PatternMatchResult KernelModuleConversion::matchAndRewrite( - ModuleOp moduleOp, ArrayRef operands, +PatternMatchResult GPUModuleConversion::matchAndRewrite( + gpu::GPUModuleOp moduleOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const { - if (!moduleOp.getAttrOfType( - gpu::GPUDialect::getKernelModuleAttrName())) { - return matchFailure(); - } // TODO : Generalize this to account for different extensions, // capabilities, extended_instruction_sets, other addressing models // and memory models. @@ -302,8 +284,8 @@ spirv::MemoryModel::GLSL450, spirv::Capability::Shader, spirv::Extension::SPV_KHR_storage_buffer_storage_class); // Move the region from the module op into the SPIR-V module. - Region &spvModuleRegion = spvModule.getOperation()->getRegion(0); - rewriter.inlineRegionBefore(moduleOp.getBodyRegion(), spvModuleRegion, + Region &spvModuleRegion = spvModule.body(); + rewriter.inlineRegionBefore(moduleOp.body(), spvModuleRegion, spvModuleRegion.begin()); // The spv.module build method adds a block with a terminator. Remove that // block. The terminator of the module op in the remaining block will be @@ -313,17 +295,6 @@ return matchSuccess(); } -//===----------------------------------------------------------------------===// -// ModuleTerminatorOp for gpu.kernel_module. -//===----------------------------------------------------------------------===// - -PatternMatchResult KernelModuleTerminatorConversion::matchAndRewrite( - ModuleTerminatorOp terminatorOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { - rewriter.replaceOpWithNewOp(terminatorOp); - return matchSuccess(); -} - //===----------------------------------------------------------------------===// // GPU return inside kernel functions to SPIR-V return. //===----------------------------------------------------------------------===// @@ -342,14 +313,18 @@ // GPU To SPIRV Patterns. //===----------------------------------------------------------------------===// +namespace { +#include "GPUToSPIRV.cpp.inc" +} + void mlir::populateGPUToSPIRVPatterns(MLIRContext *context, SPIRVTypeConverter &typeConverter, OwningRewritePatternList &patterns, ArrayRef workGroupSize) { + populateWithGenerated(context, &patterns); patterns.insert(context, typeConverter, workGroupSize); patterns.insert< - GPUReturnOpConversion, ForOpConversion, KernelModuleConversion, - KernelModuleTerminatorConversion, + GPUReturnOpConversion, ForOpConversion, GPUModuleConversion, LaunchConfigConversion, LaunchConfigConversion, LaunchConfigConversion, diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp @@ -60,15 +60,12 @@ SmallVector kernelModules; OpBuilder builder(context); - module.walk([&builder, &kernelModules](ModuleOp moduleOp) { - if (moduleOp.getAttrOfType( - gpu::GPUDialect::getKernelModuleAttrName())) { - // For each kernel module (should be only 1 for now, but that is not a - // requirement here), clone the module for conversion because the - // gpu.launch function still needs the kernel module. - builder.setInsertionPoint(moduleOp.getOperation()); - kernelModules.push_back(builder.clone(*moduleOp.getOperation())); - } + module.walk([&builder, &kernelModules](gpu::GPUModuleOp moduleOp) { + // For each kernel module (should be only 1 for now, but that is not a + // requirement here), clone the module for conversion because the + // gpu.launch function still needs the kernel module. + builder.setInsertionPoint(moduleOp.getOperation()); + kernelModules.push_back(builder.clone(*moduleOp.getOperation())); }); SPIRVTypeConverter typeConverter; diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td @@ -0,0 +1,22 @@ +//===-- GPUToSPIRV.td - GPU to SPIR-V Dialect Lowerings ----*- tablegen -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains patterns to lower GPU dialect ops to to SPIR-V ops. +// +//===----------------------------------------------------------------------===// + + +#ifndef CONVERT_GPU_TO_SPIRV +#define CONVERT_GPU_TO_SPIRV + +include "mlir/Dialect/GPU/GPUOps.td" +include "mlir/Dialect/SPIRV/SPIRVStructureOps.td" + +def : Pat<(GPU_ModuleEndOp), (SPV_ModuleEndOp)>; + +#endif // CONVERT_GPU_TO_SPIRV diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -72,15 +72,10 @@ // Check that `launch_func` refers to a well-formed GPU kernel module. StringRef kernelModuleName = launchOp.getKernelModuleName(); - auto kernelModule = module.lookupSymbol(kernelModuleName); + auto kernelModule = module.lookupSymbol(kernelModuleName); if (!kernelModule) return launchOp.emitOpError() << "kernel module '" << kernelModuleName << "' is undefined"; - if (!kernelModule.getAttrOfType( - GPUDialect::getKernelModuleAttrName())) - return launchOp.emitOpError("module '") - << kernelModuleName << "' is missing the '" - << GPUDialect::getKernelModuleAttrName() << "' attribute"; // Check that `launch_func` refers to a well-formed kernel function. StringRef kernelName = launchOp.kernel(); @@ -517,10 +512,9 @@ result.addOperands(kernelOperands); result.addAttribute(getKernelAttrName(), builder->getStringAttr(kernelFunc.getName())); - auto kernelModule = kernelFunc.getParentOfType(); - if (Optional kernelModuleName = kernelModule.getName()) - result.addAttribute(getKernelModuleAttrName(), - builder->getSymbolRefAttr(*kernelModuleName)); + auto kernelModule = kernelFunc.getParentOfType(); + result.addAttribute(getKernelModuleAttrName(), + builder->getSymbolRefAttr(kernelModule.getName())); } void LaunchFuncOp::build(Builder *builder, OperationState &result, @@ -820,6 +814,47 @@ return success(); } +//===----------------------------------------------------------------------===// +// GPUModuleOp +//===----------------------------------------------------------------------===// + +void GPUModuleOp::build(Builder *builder, OperationState &result, + StringRef name) { + ensureTerminator(*result.addRegion(), *builder, result.location); + result.attributes.push_back(builder->getNamedAttr( + ::mlir::SymbolTable::getSymbolAttrName(), builder->getStringAttr(name))); +} + +static ParseResult parseGPUModuleOp(OpAsmParser &parser, + OperationState &result) { + StringAttr nameAttr; + if (parser.parseSymbolName(nameAttr, SymbolTable::getSymbolAttrName(), + result.attributes)) + return failure(); + + // If module attributes are present, parse them. + if (parser.parseOptionalAttrDictWithKeyword(result.attributes)) + return failure(); + + // Parse the module body. + auto *body = result.addRegion(); + if (parser.parseRegion(*body, None, None)) + return failure(); + + // Ensure that this module has a valid terminator. + GPUModuleOp::ensureTerminator(*body, parser.getBuilder(), result.location); + return success(); +} + +static void print(OpAsmPrinter &p, GPUModuleOp op) { + p << op.getOperationName() << ' '; + p.printSymbolName(op.getName()); + p.printOptionalAttrDictWithKeyword(op.getAttrs(), + {SymbolTable::getSymbolAttrName()}); + p.printRegion(op.getOperation()->getRegion(0), /*printEntryBlockArgs=*/false, + /*printBlockTerminators=*/false); +} + // Namespace avoids ambiguous ReturnOpOperandAdaptor. namespace mlir { namespace gpu { diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -140,8 +140,8 @@ /// inside a nested module. It also creates an external function of the same /// name in the parent module. /// -/// The kernel modules are intended to be compiled to a cubin blob independently -/// in a separate pass. The external functions can then be annotated with the +/// The gpu.modules are intended to be compiled to a cubin blob independently in +/// a separate pass. The external functions can then be annotated with the /// symbol of the cubin accessor function. class GpuKernelOutliningPass : public ModulePass { public: @@ -174,15 +174,19 @@ } private: - // Returns a module containing kernelFunc and all callees (recursive). - ModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, - const SymbolTable &parentSymbolTable) { + // Returns a gpu.module containing kernelFunc and all callees (recursive). + gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, + const SymbolTable &parentSymbolTable) { + // TODO: This code cannot use an OpBuilder because it must be inserted into + // a SymbolTable by the caller. SymbolTable needs to be refactored to + // prevent manual building of Ops with symbols in code using SymbolTables + // and then this needs to use the OpBuilder. auto context = getModule().getContext(); Builder builder(context); - auto kernelModule = - ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName()); - kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(), - builder.getUnitAttr()); + OperationState state(kernelFunc.getLoc(), + gpu::GPUModuleOp::getOperationName()); + gpu::GPUModuleOp::build(&builder, state, kernelFunc.getName()); + auto kernelModule = cast(Operation::create(state)); SymbolTable symbolTable(kernelModule); symbolTable.insert(kernelFunc); diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir --- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir +++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir @@ -5,7 +5,7 @@ // CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00") // CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN") - module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} { + gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} { gpu.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} { gpu.return } diff --git a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir --- a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir +++ b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s -// CHECK: attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} -module @foo attributes {gpu.kernel_module} { +// CHECK: attributes {nvvm.cubin = "CUBIN"} +gpu.module @foo { llvm.func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">) // CHECK: attributes {gpu.kernel} attributes { gpu.kernel } { @@ -11,7 +11,7 @@ // ----- -module @bar attributes {gpu.kernel_module} { +gpu.module @bar { // CHECK: func @kernel_a llvm.func @kernel_a() attributes { gpu.kernel } { diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() func @gpu_index_ops() attributes { gpu.kernel } { @@ -38,7 +38,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_all_reduce_op() func @gpu_all_reduce_op() attributes { gpu.kernel } { @@ -55,7 +55,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_all_reduce_region() func @gpu_all_reduce_region() attributes { gpu.kernel } { @@ -74,7 +74,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_shuffle() func @gpu_shuffle() attributes { gpu.kernel } { @@ -99,7 +99,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK-LABEL: func @gpu_sync() func @gpu_sync() attributes { gpu.kernel } { @@ -111,7 +111,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK: llvm.func @__nv_fabsf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_fabs(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_fabs @@ -126,7 +126,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK: llvm.func @__nv_ceilf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_ceil(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_ceil @@ -141,7 +141,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK: llvm.func @__nv_cosf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_cos(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_cos @@ -156,7 +156,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @test_module { // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_exp(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_exp @@ -174,7 +174,7 @@ // ----- // Test that we handled properly operation with SymbolTable other than module op -module attributes {gpu.kernel_module} { +gpu.module @test_module { "test.symbol_scope"() ({ // CHECK: test.symbol_scope // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float diff --git a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir --- a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir +++ b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s -module attributes {gpu.kernel_module} { +gpu.module @kernel { // CHECK-LABEL: llvm.func @private gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) { // Allocate private memory inside the function. @@ -32,7 +32,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel { // Workgroup buffers are allocated as globals. // CHECK: llvm.mlir.global internal @[[buffer:.*]]() // CHECK-SAME: addr_space = 3 @@ -72,7 +72,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel { // Check that the total size was computed correctly. // CHECK: llvm.mlir.global internal @[[buffer:.*]]() // CHECK-SAME: addr_space = 3 @@ -113,7 +113,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel { // Check that several buffers are defined. // CHECK: llvm.mlir.global internal @[[buffer1:.*]]() // CHECK-SAME: !llvm<"[1 x float]"> diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s -module attributes {gpu.kernel_module} { +gpu.module @kernel_module { // CHECK-LABEL: func @gpu_index_ops() func @gpu_index_ops() attributes { gpu.kernel } { @@ -38,7 +38,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel_module { // CHECK: llvm.func @_ocml_fabs_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @_ocml_fabs_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_fabs @@ -53,7 +53,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel_module { // CHECK: llvm.func @_ocml_ceil_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @_ocml_ceil_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_ceil @@ -68,7 +68,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel_module { // CHECK: llvm.func @_ocml_cos_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @_ocml_cos_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_cos @@ -83,7 +83,7 @@ // ----- -module attributes {gpu.kernel_module} { +gpu.module @kernel_module { // CHECK: llvm.func @_ocml_exp_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @_ocml_exp_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_exp @@ -102,7 +102,7 @@ // ----- // Test that we handled properly operation with SymbolTable other than module op -module attributes {gpu.kernel_module} { +gpu.module @kernel_module { "test.symbol_scope"() ({ // CHECK: test.symbol_scope // CHECK: llvm.func @_ocml_exp_f32(!llvm.float) -> !llvm.float diff --git a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir --- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir @@ -9,7 +9,7 @@ // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_workgroup_id_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -32,7 +32,7 @@ // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_workgroup_id_y() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -55,7 +55,7 @@ // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_workgroup_id_z() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -78,7 +78,7 @@ // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_workgroup_size_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]] @@ -101,7 +101,7 @@ // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_local_id_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]] @@ -124,7 +124,7 @@ // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups") - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @builtin_num_workgroups_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]] diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir --- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir @@ -16,7 +16,7 @@ } // CHECK-LABEL: spv.module "Logical" "GLSL450" - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { // CHECK-DAG: spv.globalVariable [[WORKGROUPSIZEVAR:@.*]] built_in("WorkgroupSize") : !spv.ptr, Input> // CHECK-DAG: spv.globalVariable [[NUMWORKGROUPSVAR:@.*]] built_in("NumWorkgroups") : !spv.ptr, Input> // CHECK-DAG: spv.globalVariable [[LOCALINVOCATIONIDVAR:@.*]] built_in("LocalInvocationId") : !spv.ptr, Input> diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir --- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir @@ -7,7 +7,7 @@ return } - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @loop_kernel(%arg2 : memref<10xf32>, %arg3 : memref<10xf32>) attributes {gpu.kernel} { // CHECK: [[LB:%.*]] = spv.constant 4 : i32 diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir --- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir @@ -2,7 +2,7 @@ module attributes {gpu.container_module} { - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { // CHECK: spv.module "Logical" "GLSL450" { // CHECK-LABEL: func @kernel_1 // CHECK-SAME: {{%.*}}: f32 {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -167,7 +167,7 @@ } func @launch_func_missing_module_attribute(%sz : index) { - // expected-error@+1 {{module 'kernels' is missing the 'gpu.kernel_module' attribute}} + // expected-error@+1 {{kernel module 'kernels' is undefined}} "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) { kernel = "kernel_1", kernel_module = @kernels } : (index, index, index, index, index, index) -> () @@ -178,8 +178,7 @@ // ----- module attributes {gpu.container_module} { - module @kernels attributes {gpu.kernel_module} { - } + gpu.module @kernels { } func @launch_func_undefined_function(%sz : index) { // expected-error@+1 {{kernel function 'kernel_1' is undefined}} @@ -193,7 +192,7 @@ // ----- module attributes {gpu.container_module} { - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @kernel_1(%arg1 : !llvm<"float*">) kernel { gpu.return } @@ -211,7 +210,7 @@ // ----- module attributes {gpu.container_module} { - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } { gpu.return } @@ -229,7 +228,7 @@ // ----- -module @kernels attributes {gpu.kernel_module} { +gpu.module @kernels { gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } { gpu.return } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -60,7 +60,7 @@ return } - module @kernels attributes {gpu.kernel_module} { + gpu.module @kernels { gpu.func @kernel_1(%arg0 : f32, %arg1 : memref) attributes {gpu.kernel} { %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -136,7 +136,7 @@ gpu.return } -// CHECK: module @function_call_kernel attributes {gpu.kernel_module} { +// CHECK: gpu.module @function_call_kernel { // CHECK: gpu.func @function_call_kernel() // CHECK: call @device_function() : () -> () // CHECK: call @device_function() : () -> () diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -105,7 +105,7 @@ applyPassManagerCLOptions(pm); pm.addPass(createGpuKernelOutliningPass()); - auto &kernelPm = pm.nest(); + auto &kernelPm = pm.nest(); kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin)); pm.addPass(createLowerToLLVMPass());