Index: mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
===================================================================
--- mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
+++ mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
@@ -85,4 +85,23 @@
   }];
 }
 
+
+def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [
+  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] >  {
+  let parameters = (ins
+    EnumParameter<GPU_AddressSpaceEnum>:$address_space
+  );
+  let assemblyFormat = "`<` params `>`";
+  let description = [{
+    An attribute that allows defining a memory hierarchy for GPU devices.
+
+    GPU memory has three memory spaces: global, workgroup, and private. Global
+    memory is visible to all workitems and workgroups, workgroup memory is only
+    visible to workitems within the same workgroup, and private memory is only
+    visible to a single workitem. This attribute indicates that use of the
+    memory hierarchy is desired. It can be consumed by lowerings to move data
+    into a specific address space in GPU code.
+  }];
+}
+
+
 #endif // GPU_DEVICE_MAPPING_ATTR
Index: mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
===================================================================
--- mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -765,6 +765,7 @@
 // PromoteOp
 //===----------------------------------------------------------------------===//
 
+
 def PromoteOp : Op<Transform_Dialect, "structured.promote",
     [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
     TransformOpInterface, TransformEachOpTrait]> {
@@ -791,6 +792,7 @@
                        DefaultValuedAttr<BoolArrayAttr, "{}">:$use_full_tile_buffers,
                        UnitAttr:$use_full_tiles_by_default,
                        UnitAttr:$use_alloca,
+                       OptionalAttr<DeviceMappingArrayAttr>:$mapping,
                        OptionalAttr<I64Attr>:$alignment);
 
   let results = (outs PDL_Operation:$transformed);
Index: mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
===================================================================
--- mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -393,6 +393,32 @@
 FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
                                     const LinalgPromotionOptions &options);
 
+/// Allocate the subview in the GPU workgroup memory.
+Optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
+                                        memref::SubViewOp subview,
+                                        ArrayRef<Value> sizeBounds,
+                                        DataLayout &);
+
+/// In case of GPU workgroup memory there is no need to deallocate.
+LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/);
+
+/// Create a memref.copy operation and add gpu.barrier guards before and after
+/// the copy operation to ensure data integrity.
+LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst);
+
+/// Allocate the subview in the GPU private memory.
+Optional<Value> allocateGPUPrivateMemory(OpBuilder &builder,
+                                         memref::SubViewOp subview,
+                                         ArrayRef<Value> sizeBounds,
+                                         DataLayout &);
+
+/// Normal copy between src and dst.
+LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst);
+
+/// In case of GPU private memory there is no need to deallocate since the
+/// memory is freed when going out of scope.
+LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/);
+
 /// Emit a suitable vector form for a Linalg op. If provided, `inputVectorSizes`
 /// are used to vectorize this operation. `inputVectorSizes` must match the rank
 /// of the iteration space of the operation and the sizes must be smaller or
Index: mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
===================================================================
--- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -50,6 +50,10 @@
   return static_cast<int64_t>(getThread());
 }
 
+int64_t GPUMemorySpaceMappingAttr::getMappingId() const {
+  return static_cast<int64_t>(getAddressSpace());
+}
+
 //===----------------------------------------------------------------------===//
 // MMAMatrixType
 //===----------------------------------------------------------------------===//
Index: mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
===================================================================
--- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1800,6 +1800,35 @@
   if (getAlignment().has_value())
     promotionOptions = promotionOptions.setAlignment(*getAlignment());
 
+  if (getMapping().has_value()) {
+    // The mapping should contain at most one element.
+    auto mapping = *getMapping();
+    if (mapping.size() > 1)
+      return emitDefaultDefiniteFailure(target);
+
+    auto addressSpace = mapping[0].cast<gpu::GPUMemorySpaceMappingAttr>();
+
+    if (addressSpace.getAddressSpace() ==
+        gpu::GPUDialect::getWorkgroupAddressSpace()) {
+      promotionOptions =
+          promotionOptions
+              .setAllocationDeallocationFns(allocateWorkgroupMemory,
+                                            deallocateWorkgroupMemory)
+              .setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory)
+              .setUseFullTileBuffers({false, false});
+    } else if (addressSpace.getAddressSpace() ==
+               gpu::GPUDialect::getPrivateAddressSpace()) {
+      promotionOptions =
+          promotionOptions
+              .setAllocationDeallocationFns(allocateGPUPrivateMemory,
+                                            deallocateGPUPrivateMemory)
+              .setCopyInOutFns(copyToGPUPrivateMemory, copyToGPUPrivateMemory)
+              .setUseFullTileBuffers({false, false});
+    } else {
+      return emitDefaultDefiniteFailure(target);
+    }
+  }
+
   if (failed(promoteSubviewsPrecondition(target, promotionOptions)))
     return emitDefaultDefiniteFailure(target);
Index: mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
===================================================================
--- mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+++ mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
@@ -13,6 +13,8 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -397,3 +399,87 @@
     return failure();
   return res;
 }
+
+/// Allocate the given subview in a GPU address space by creating an
+/// allocation operation and setting the memref type's memory space to the
+/// desired address space.
+static Optional<Value> allocateSubviewGPUMemoryInAddressSpace(
+    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
+    gpu::AddressSpace addressSpace) {
+  OpBuilder::InsertionGuard guard(builder);
+
+  func::FuncOp funcOp = subview->getParentOfType<func::FuncOp>();
+  if (!funcOp)
+    return std::nullopt;
+
+  // The subview size bounds are expected to be constant; they specify the
+  // shape of the allocation.
+  SmallVector<int64_t> shape;
+  for (Value bound : sizeBounds) {
+    APInt value;
+    if (!matchPattern(bound, m_ConstantInt(&value)))
+      return std::nullopt;
+    shape.push_back(value.getSExtValue());
+  }
+
+  builder.setInsertionPoint(&funcOp.front(), funcOp.front().begin());
+  auto type = MemRefType::get(
+      shape, subview.getType().getElementType(), MemRefLayoutAttrInterface{},
+      gpu::AddressSpaceAttr::get(builder.getContext(), addressSpace));
+  Value buffer;
+  if (addressSpace == gpu::GPUDialect::getWorkgroupAddressSpace()) {
+    buffer = builder.create<memref::AllocOp>(funcOp.getLoc(), type);
+  } else if (addressSpace == gpu::GPUDialect::getPrivateAddressSpace()) {
+    buffer = builder.create<memref::AllocaOp>(funcOp.getLoc(), type);
+  } else {
+    return std::nullopt;
+  }
+  return buffer;
+}
+
+/// Allocate the subview in the GPU workgroup memory.
+Optional<Value> mlir::linalg::allocateWorkgroupMemory(
+    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
+    DataLayout &) {
+  return allocateSubviewGPUMemoryInAddressSpace(
+      builder, subview, sizeBounds,
+      gpu::GPUDialect::getWorkgroupAddressSpace());
+}
+
+/// In case of GPU workgroup memory there is no need to deallocate.
+LogicalResult mlir::linalg::deallocateWorkgroupMemory(OpBuilder &,
+                                                      Value /*buffer*/) {
+  return success();
+}
+
+/// Create a memref.copy operation and add gpu.barrier guards before and after
+/// the copy operation to ensure data integrity.
+LogicalResult mlir::linalg::copyToWorkgroupMemory(OpBuilder &b, Value src,
+                                                  Value dst) {
+  b.create<gpu::BarrierOp>(src.getLoc());
+  Operation *copyOp = b.create<memref::CopyOp>(src.getLoc(), src, dst);
+  b.create<gpu::BarrierOp>(copyOp->getLoc());
+  return success();
+}
+
+/// Allocate the subview in the GPU private memory.
+Optional<Value> mlir::linalg::allocateGPUPrivateMemory(
+    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
+    DataLayout &) {
+  return allocateSubviewGPUMemoryInAddressSpace(
+      builder, subview, sizeBounds, gpu::GPUDialect::getPrivateAddressSpace());
+}
+
+/// Normal copy between src and dst.
+LogicalResult mlir::linalg::copyToGPUPrivateMemory(OpBuilder &b, Value src,
+                                                   Value dst) {
+  b.create<memref::CopyOp>(src.getLoc(), src, dst);
+  return success();
+}
+
+/// In case of GPU private memory there is no need to deallocate since the
+/// memory is freed when going out of scope.
+LogicalResult mlir::linalg::deallocateGPUPrivateMemory(OpBuilder &,
+                                                       Value /*buffer*/) {
+  return success();
+}
\ No newline at end of file
Index: mlir/test/Dialect/Linalg/promote.mlir
===================================================================
--- mlir/test/Dialect/Linalg/promote.mlir
+++ mlir/test/Dialect/Linalg/promote.mlir
@@ -142,6 +142,94 @@
   %1 = transform.structured.promote %0
 }
 
+// -----
+
+func.func @gemm_shared(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+   linalg.matmul ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
+                outs(%c: memref<?x?xf32>)
+   return
+}
+
+// CHECK: func @gemm_shared
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK: %[[alloc_A:.*]] = memref.alloc() : memref<16x16xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[alloc_B:.*]] = memref.alloc() : memref<16x16xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1
+// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:   scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:     scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:       %[[subview_A:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+// CHECK:       %[[subview_B:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+// CHECK:       %[[subview_C:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+
+// CHECK:       %[[shared_A:.*]] = memref.subview %[[alloc_B]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<workgroup>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
+// CHECK:       %[[shared_B:.*]] = memref.subview %[[alloc_A]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<workgroup>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
+
+// CHECK-NEXT:  gpu.barrier
+// CHECK-NEXT:  memref.copy %[[subview_A]], %[[shared_A]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
+// CHECK-NEXT:  gpu.barrier
+
+// CHECK-NEXT:  gpu.barrier
+// CHECK-NEXT:  memref.copy %[[subview_B]], %[[shared_B]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
+// CHECK-NEXT:  gpu.barrier
+
+// CHECK:       linalg.matmul ins(%[[shared_A]], %[[shared_B]]{{.*}} outs(%[[subview_C]]
+
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !pdl.operation):
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+  %1, %loops:3 = transform.structured.tile %0 [16, 16, 16] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
+  %2 = transform.structured.promote %1 { operands_to_promote = [0, 1], mapping = [#gpu.memory_space<workgroup>] }
+}
+
+
+// -----
+
+func.func @gemm_private(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+   linalg.matmul ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
+                outs(%c: memref<?x?xf32>)
+   return
+}
+
+// CHECK: func @gemm_private
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK: %[[alloc_A:.*]] = memref.alloca() : memref<16x16xf32, #gpu.address_space<private>>
+// CHECK: %[[alloc_B:.*]] = memref.alloca() : memref<16x16xf32, #gpu.address_space<private>>
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1
+// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:   scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:     scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:       %[[subview_A:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+// CHECK:       %[[subview_B:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+// CHECK:       %[[subview_C:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+
+// CHECK:       %[[private_A:.*]] = memref.subview %[[alloc_B]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<private>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<private>>
+// CHECK:       %[[private_B:.*]] = memref.subview %[[alloc_A]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<private>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<private>>
+
+// CHECK-NEXT:  memref.copy %[[subview_A]], %[[private_A]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<private>>
+// CHECK-NEXT:  memref.copy %[[subview_B]], %[[private_B]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<private>>
+
+// CHECK:       linalg.matmul ins(%[[private_A]], %[[private_B]]{{.*}} outs(%[[subview_C]]
+
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !pdl.operation):
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+  %1, %loops:3 = transform.structured.tile %0 [16, 16, 16] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
+  %2 = transform.structured.promote %1 { operands_to_promote = [0, 1], mapping = [#gpu.memory_space<private>] }
+}
+
+
 // -----
 
 #map6 = affine_map<(d0, d1, d2) -> (d0, d2)>