diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h --- a/mlir/include/mlir/Dialect/GPU/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Passes.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_GPU_PASSES_H_ #define MLIR_DIALECT_GPU_PASSES_H_ +#include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Pass/Pass.h" @@ -25,7 +26,8 @@ namespace mlir { /// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into /// a separate kernel function. -std::unique_ptr> createGpuKernelOutliningPass(); +std::unique_ptr> createGpuKernelOutliningPass( + DataLayoutSpecInterface dataLayoutSpec = DataLayoutSpecAttr()); /// Rewrites a function region so that GPU ops execute asynchronously. std::unique_ptr> createGpuAsyncRegionPass(); diff --git a/mlir/include/mlir/Dialect/GPU/Passes.td b/mlir/include/mlir/Dialect/GPU/Passes.td --- a/mlir/include/mlir/Dialect/GPU/Passes.td +++ b/mlir/include/mlir/Dialect/GPU/Passes.td @@ -14,6 +14,7 @@ def GpuKernelOutlining : Pass<"gpu-kernel-outlining", "ModuleOp"> { let summary = "Outline gpu.launch bodies to kernel functions"; let constructor = "mlir::createGpuKernelOutliningPass()"; + let dependentDialects = ["mlir::DLTIDialect"]; } def GpuAsyncRegionPass : FunctionPass<"gpu-async-region"> { diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -12,6 +12,7 @@ #include "PassDetail.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" +#include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/GPU/Passes.h" #include "mlir/Dialect/GPU/Utils.h" @@ -227,6 +228,17 @@ } namespace { + +/// Return a data layout specification attribute with an entry that sets the +/// Index type size to 32-bit. 
+DataLayoutSpecAttr create32bitIndexTypeDataLayout(MLIRContext *context) { + auto intAttr32 = IntegerAttr::get( + IntegerType::get(context, /*width=*/32, IntegerType::Signless), + /*value=*/32); + return DataLayoutSpecAttr::get( + context, {DataLayoutEntryAttr::get(IndexType::get(context), intAttr32)}); +} + /// Pass that moves the kernel of each LaunchOp into its separate nested module. /// /// This pass moves the kernel code of each LaunchOp into a function created @@ -239,7 +251,21 @@ class GpuKernelOutliningPass : public GpuKernelOutliningBase { public: + explicit GpuKernelOutliningPass(DataLayoutSpecInterface dlSpec) + : dataLayoutSpec(dlSpec) {} + + GpuKernelOutliningPass(const GpuKernelOutliningPass &other) + : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) { + test32bitIndexTypeDL = other.test32bitIndexTypeDL; + } + void runOnOperation() override { + // If the test flag was provided, overwrite the data layout specification + // with a new one that sets the Index type size to 32-bit. + if (test32bitIndexTypeDL) + dataLayoutSpec = + create32bitIndexTypeDataLayout(getOperation().getContext()); + SymbolTable symbolTable(getOperation()); bool modified = false; for (auto func : getOperation().getOps()) { @@ -278,6 +304,14 @@ UnitAttr::get(&getContext())); } + Option test32bitIndexTypeDL{ + *this, "test-32bit-index-type-dl", + llvm::cl::desc( + "Create a kernel module and attach a data layout specification with " + "index type set to 32 bits. This flag is only intended to be used " + "for testing purposes"), + llvm::cl::init(false)}; + private: /// Returns a gpu.module containing kernelFunc and all callees (recursive). gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, @@ -290,6 +324,12 @@ OpBuilder builder(context); auto kernelModule = builder.create(kernelFunc.getLoc(), kernelFunc.getName()); + + // If a valid data layout spec was provided, attach it to the kernel module. + // Otherwise, the default data layout will be used. 
+ if (dataLayoutSpec) + kernelModule->setAttr("dlspec", dataLayoutSpec); + SymbolTable symbolTable(kernelModule); symbolTable.insert(kernelFunc); @@ -313,10 +353,13 @@ return kernelModule; } + + DataLayoutSpecInterface dataLayoutSpec; }; } // namespace -std::unique_ptr> mlir::createGpuKernelOutliningPass() { - return std::make_unique(); +std::unique_ptr> +mlir::createGpuKernelOutliningPass(DataLayoutSpecInterface dataLayoutSpec) { + return std::make_unique(dataLayoutSpec); } diff --git a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h --- a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h +++ b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h @@ -10,6 +10,7 @@ #define DIALECT_GPU_TRANSFORMS_PASSDETAIL_H_ #include "mlir/Dialect/Async/IR/Async.h" +#include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Pass/Pass.h" namespace mlir { diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining=test-32bit-index-type-dl -split-input-file %s | FileCheck --check-prefix CHECK-DL %s // CHECK: module attributes {gpu.container_module} @@ -35,8 +36,9 @@ return } +// CHECK-DL-LABEL: gpu.module @launch_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} -// CHECK-LABEL: module @launch_kernel +// CHECK-LABEL: gpu.module @launch_kernel // CHECK-NEXT: gpu.func @launch_kernel // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref) // CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index @@ -81,7 +83,10 @@ return } -// CHECK: module @multiple_launches_kernel +// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module 
@multiple_launches_kernel_0 attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} + +// CHECK: gpu.module @multiple_launches_kernel // CHECK: func @multiple_launches_kernel // CHECK: module @multiple_launches_kernel_0 // CHECK: func @multiple_launches_kernel @@ -106,6 +111,8 @@ return } +// CHECK-DL-LABEL: gpu.module @extra_constants_not_inlined_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} + // CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref, %{{.*}}: index) // CHECK: arith.constant 2 @@ -130,6 +137,8 @@ return } +// CHECK-DL-LABEL: gpu.module @extra_constants_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} + // CHECK-LABEL: func @extra_constants_kernel( // CHECK-SAME: %[[KARG0:.*]]: memref // CHECK: arith.constant 2 @@ -158,6 +167,8 @@ return } +// CHECK-DL-LABEL: gpu.module @extra_constants_noarg_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} + // CHECK-LABEL: func @extra_constants_noarg_kernel( // CHECK-SAME: %[[KARG0:.*]]: memref, %[[KARG1:.*]]: index // CHECK: %[[KCST:.*]] = arith.constant 2 @@ -186,6 +197,8 @@ return } +// CHECK-DL-LABEL: gpu.module @multiple_uses_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} + // ----- // CHECK-LABEL: @multiple_uses2 @@ -213,6 +226,8 @@ return } +// CHECK-DL-LABEL: gpu.module @multiple_uses2_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} + // ----- llvm.mlir.global internal @global(42 : i64) : i64 @@ -242,6 +257,8 @@ return } +// CHECK-DL-LABEL: gpu.module @function_call_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry>} + // CHECK: gpu.module @function_call_kernel { // CHECK: gpu.func @function_call_kernel() // CHECK: call @device_function() : () -> () diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3000,6 +3000,7 @@ deps = [ ":ArithmeticDialect", 
":Async", + ":DLTIDialect", ":GPUDialect", ":GPUPassIncGen", ":MemRefDialect",