diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td
@@ -33,9 +33,6 @@
     /// functions.
     static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }

-    /// Get the name of the attribute used to annotate kernel modules.
-    static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }
-
     /// Returns whether the given function is a kernel function, i.e., has the
     /// 'gpu.kernel' attribute.
     static bool isKernel(Operation *op);
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -85,7 +85,8 @@
   }];
 }

-def GPU_GPUFuncOp : GPU_Op<"func", [AutomaticAllocationScope, FunctionLike,
+def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">,
+                                    AutomaticAllocationScope, FunctionLike,
                                     IsolatedFromAbove, Symbol]> {
   let summary = "Function executable on a GPU";

diff --git a/mlir/test/Dialect/GPU/all-reduce-max.mlir b/mlir/test/Dialect/GPU/all-reduce-max.mlir
--- a/mlir/test/Dialect/GPU/all-reduce-max.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce-max.mlir
@@ -1,8 +1,8 @@
 // RUN: mlir-opt -test-all-reduce-lowering %s | FileCheck %s

 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py

-// CHECK: module @kernels attributes {gpu.kernel_module} {
-module @kernels attributes {gpu.kernel_module} {
+// CHECK: gpu.module @kernels {
+gpu.module @kernels {
 // CHECK-LABEL: gpu.func @kernel(
 // CHECK-SAME: [[VAL_0:%.*]]: f32) workgroup([[VAL_1:%.*]] : memref<32xf32, 3>) kernel {
diff --git a/mlir/test/Dialect/GPU/all-reduce.mlir b/mlir/test/Dialect/GPU/all-reduce.mlir
--- a/mlir/test/Dialect/GPU/all-reduce.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce.mlir
@@ -1,8 +1,8 @@
 // RUN: mlir-opt -test-all-reduce-lowering %s | FileCheck %s

 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py

-// CHECK: module @kernels attributes {gpu.kernel_module} {
-module @kernels attributes {gpu.kernel_module} {
+// CHECK: gpu.module @kernels {
+gpu.module @kernels {
 // CHECK-LABEL: gpu.func @kernel(
 // CHECK-SAME: [[VAL_0:%.*]]: f32) workgroup([[VAL_1:%.*]] : memref<32xf32, 3>) kernel {
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -109,6 +109,17 @@

 // -----

+module attributes {gpu.container_module} {
+  module @kernels {
+    // expected-error@+1 {{'gpu.func' op expects parent op 'gpu.module'}}
+    gpu.func @kernel_1(%arg1 : !llvm<"float*">) {
+      gpu.return
+    }
+  }
+}
+
+// -----
+
 module attributes {gpu.container_module} {
   module @kernels {
   }
@@ -341,7 +352,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{requires 'type' attribute of function type}}
     "gpu.func"() ({
       gpu.return
@@ -352,7 +363,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{expected memref type in attribution}}
     gpu.func @kernel() workgroup(%0: i32) {
       gpu.return
@@ -363,7 +374,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{expected memory space 3 in attribution}}
     gpu.func @kernel() workgroup(%0: memref<4xf32>) {
       gpu.return
@@ -374,7 +385,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{expected memory space 5 in attribution}}
     gpu.func @kernel() private(%0: memref<4xf32>) {
       gpu.return
@@ -385,7 +396,7 @@
 // -----

 module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // expected-error @+1 {{expected memory space 5 in attribution}}
     gpu.func @kernel() private(%0: memref<4xf32>) {
       gpu.return
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -83,7 +83,7 @@
     return
   }

-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
     // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
     // CHECK: workgroup
     // CHECK: private
diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir
--- a/mlir/test/Dialect/GPU/promotion.mlir
+++ b/mlir/test/Dialect/GPU/promotion.mlir
@@ -1,119 +1,128 @@
 // RUN: mlir-opt -allow-unregistered-dialect -test-gpu-memory-promotion -split-input-file %s | FileCheck %s

-module @foo attributes {gpu.kernel_module} {
-  // Verify that the attribution was indeed introduced
-  // CHECK-LABEL: @memref3d
-  // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
-  // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
-  gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
-    // Verify that loop bounds are emitted, the order does not matter.
-    // CHECK-DAG: %[[c1:.*]] = constant 1
-    // CHECK-DAG: %[[c4:.*]] = constant 4
-    // CHECK-DAG: %[[c5:.*]] = constant 5
-    // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
-    // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
-    // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
-    // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
-    // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
-    // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
-
-    // Verify that loops for the copy are emitted. We only check the number of
-    // loops here since their bounds are produced by mapLoopToProcessorIds,
-    // tested separately.
-    // CHECK: loop.for %[[i0:.*]] =
-    // CHECK: loop.for %[[i1:.*]] =
-    // CHECK: loop.for %[[i2:.*]] =
-
-    // Verify that the copy is emitted and uses only the last two loops.
-    // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
-    // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]
-
-    // Verify that the use has been rewritten.
-    // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
-    "use"(%arg0) : (memref<5x4xf32>) -> ()
-
-
-    // Verify that loops for the copy are emitted. We only check the number of
-    // loops here since their bounds are produced by mapLoopToProcessorIds,
-    // tested separately.
-    // CHECK: loop.for %[[i0:.*]] =
-    // CHECK: loop.for %[[i1:.*]] =
-    // CHECK: loop.for %[[i2:.*]] =
-
-    // Verify that the copy is emitted and uses only the last two loops.
-    // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
-    // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
-    gpu.return
+module {
+  gpu.module @foo {
+
+    // Verify that the attribution was indeed introduced
+    // CHECK-LABEL: @memref3d
+    // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
+    // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
+    gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
+      // Verify that loop bounds are emitted, the order does not matter.
+      // CHECK-DAG: %[[c1:.*]] = constant 1
+      // CHECK-DAG: %[[c4:.*]] = constant 4
+      // CHECK-DAG: %[[c5:.*]] = constant 5
+      // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+      // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+      // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+      // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+      // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+      // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+      // Verify that loops for the copy are emitted. We only check the number of
+      // loops here since their bounds are produced by mapLoopToProcessorIds,
+      // tested separately.
+      // CHECK: loop.for %[[i0:.*]] =
+      // CHECK: loop.for %[[i1:.*]] =
+      // CHECK: loop.for %[[i2:.*]] =
+
+      // Verify that the copy is emitted and uses only the last two loops.
+      // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
+      // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]
+
+      // Verify that the use has been rewritten.
+      // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
+      "use"(%arg0) : (memref<5x4xf32>) -> ()
+
+
+      // Verify that loops for the copy are emitted. We only check the number of
+      // loops here since their bounds are produced by mapLoopToProcessorIds,
+      // tested separately.
+      // CHECK: loop.for %[[i0:.*]] =
+      // CHECK: loop.for %[[i1:.*]] =
+      // CHECK: loop.for %[[i2:.*]] =
+
+      // Verify that the copy is emitted and uses only the last two loops.
+      // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
+      // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
+      gpu.return
+    }
   }
 }

 // -----

-module @foo attributes {gpu.kernel_module} {
-  // Verify that the attribution was indeed introduced
-  // CHECK-LABEL: @memref5d
-  // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
-  // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
-  gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
-    // Verify that loop bounds are emitted, the order does not matter.
-    // CHECK-DAG: %[[c0:.*]] = constant 0
-    // CHECK-DAG: %[[c1:.*]] = constant 1
-    // CHECK-DAG: %[[c4:.*]] = constant 4
-    // CHECK-DAG: %[[c5:.*]] = constant 5
-    // CHECK-DAG: %[[c6:.*]] = constant 6
-    // CHECK-DAG: %[[c7:.*]] = constant 7
-    // CHECK-DAG: %[[c8:.*]] = constant 8
-    // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
-    // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
-    // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
-    // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
-    // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
-    // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
-
-    // Verify that loops for the copy are emitted.
-    // CHECK: loop.for %[[i0:.*]] =
-    // CHECK: loop.for %[[i1:.*]] =
-    // CHECK: loop.for %[[i2:.*]] =
-    // CHECK: loop.for %[[i3:.*]] =
-    // CHECK: loop.for %[[i4:.*]] =
-
-    // Verify that the copy is emitted.
-    // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
-    // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
-
-    // Verify that the use has been rewritten.
-    // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
-    "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
-
-    // Verify that loop loops for the copy are emitted.
-    // CHECK: loop.for %[[i0:.*]] =
-    // CHECK: loop.for %[[i1:.*]] =
-    // CHECK: loop.for %[[i2:.*]] =
-    // CHECK: loop.for %[[i3:.*]] =
-    // CHECK: loop.for %[[i4:.*]] =
-
-    // Verify that the copy is emitted.
-    // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
-    // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
-    gpu.return
+module {
+  gpu.module @foo {
+
+    // Verify that the attribution was indeed introduced
+    // CHECK-LABEL: @memref5d
+    // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
+    // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
+    gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
+      // Verify that loop bounds are emitted, the order does not matter.
+      // CHECK-DAG: %[[c0:.*]] = constant 0
+      // CHECK-DAG: %[[c1:.*]] = constant 1
+      // CHECK-DAG: %[[c4:.*]] = constant 4
+      // CHECK-DAG: %[[c5:.*]] = constant 5
+      // CHECK-DAG: %[[c6:.*]] = constant 6
+      // CHECK-DAG: %[[c7:.*]] = constant 7
+      // CHECK-DAG: %[[c8:.*]] = constant 8
+      // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+      // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+      // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+      // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+      // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+      // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+      // Verify that loops for the copy are emitted.
+      // CHECK: loop.for %[[i0:.*]] =
+      // CHECK: loop.for %[[i1:.*]] =
+      // CHECK: loop.for %[[i2:.*]] =
+      // CHECK: loop.for %[[i3:.*]] =
+      // CHECK: loop.for %[[i4:.*]] =
+
+      // Verify that the copy is emitted.
+      // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+      // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+
+      // Verify that the use has been rewritten.
+      // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
+      "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
+
+      // Verify that loops for the copy are emitted.
+      // CHECK: loop.for %[[i0:.*]] =
+      // CHECK: loop.for %[[i1:.*]] =
+      // CHECK: loop.for %[[i2:.*]] =
+      // CHECK: loop.for %[[i3:.*]] =
+      // CHECK: loop.for %[[i4:.*]] =
+
+      // Verify that the copy is emitted.
+      // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+      // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+      gpu.return
+    }
   }
 }

 // -----

-module @foo attributes {gpu.kernel_module} {
-  // Check that attribution insertion works fine.
-  // CHECK-LABEL: @insert
-  // CHECK-SAME: (%{{.*}}: memref<4xf32>
-  // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
-  // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
-  // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
-  gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
-      workgroup(%arg1: memref<1x1xf64, 3>)
-      private(%arg2: memref<1x1xi64, 5>)
-      kernel {
-    // CHECK: "use"(%[[wg2]])
-    "use"(%arg0) : (memref<4xf32>) -> ()
-    gpu.return
+module {
+  gpu.module @foo {
+
+    // Check that attribution insertion works fine.
+    // CHECK-LABEL: @insert
+    // CHECK-SAME: (%{{.*}}: memref<4xf32>
+    // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
+    // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
+    // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
+    gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
+        workgroup(%arg1: memref<1x1xf64, 3>)
+        private(%arg2: memref<1x1xi64, 5>)
+        kernel {
+      // CHECK: "use"(%[[wg2]])
+      "use"(%arg0) : (memref<4xf32>) -> ()
+      gpu.return
+    }
   }
 }
diff --git a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
--- a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
+++ b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
@@ -25,13 +25,18 @@
 /// or beneficial (e.g., makes previously uncoalesced loads coalesced).
 class TestGpuMemoryPromotionPass
     : public PassWrapper<TestGpuMemoryPromotionPass,
-                         OperationPass<gpu::GPUFuncOp>> {
+                         OperationPass<gpu::GPUModuleOp>> {
   void runOnOperation() override {
-    gpu::GPUFuncOp op = getOperation();
-    for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
-      if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
-        promoteToWorkgroupMemory(op, i);
-    }
+    gpu::GPUModuleOp gpuModule = getOperation();
+    gpuModule.walk([](Operation *operation) {
+      gpu::GPUFuncOp op = llvm::dyn_cast<gpu::GPUFuncOp>(operation);
+      if (op) {
+        for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
+          if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
+            promoteToWorkgroupMemory(op, i);
+        }
+      }
+    });
   }
 };
 } // end namespace
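
Illustration (not part of the patch): a minimal sketch of the nesting this change enforces. The kernel name and body are hypothetical; what the patch requires is only that gpu.func be nested in the first-class gpu.module op rather than in a builtin module carrying the now-removed gpu.kernel_module attribute, which the new HasParent<"GPUModuleOp"> trait and the "expects parent op 'gpu.module'" verifier error above check for.

  // Previously: a builtin module tagged with the gpu.kernel_module attribute.
  module @kernels attributes {gpu.kernel_module} {
    gpu.func @kernel(%arg0: f32) kernel {
      gpu.return
    }
  }

  // Now: a dedicated gpu.module op inside a gpu.container_module.
  module attributes {gpu.container_module} {
    gpu.module @kernels {
      gpu.func @kernel(%arg0: f32) kernel {
        gpu.return
      }
    }
  }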