Index: mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
===================================================================
--- mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -791,6 +791,8 @@
                        DefaultValuedAttr<BoolArrayAttr, "{}">:$use_full_tile_buffers,
                        UnitAttr:$use_full_tiles_by_default,
                        UnitAttr:$use_alloca,
+                       UnitAttr:$use_gpu_workgroup_memory,
+                       UnitAttr:$use_gpu_private_memory,
                        OptionalAttr<I64Attr>:$alignment);
   let results = (outs PDL_Operation:$transformed);
Index: mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
===================================================================
--- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -11,6 +11,7 @@
 #include "mlir/AsmParser/AsmParser.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -1780,6 +1781,82 @@
 //===----------------------------------------------------------------------===//
 // PromoteOp
 //===----------------------------------------------------------------------===//
+
+/// Allocate a buffer in GPU workgroup memory at the start of the enclosing
+/// function. The shape of the buffer is given by the (constant) subview size
+/// bounds.
+static std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
+                                                    memref::SubViewOp subview,
+                                                    ArrayRef<Value> sizeBounds,
+                                                    DataLayout &) {
+  OpBuilder::InsertionGuard guard(builder);
+
+  func::FuncOp funcOp = subview->getParentOfType<func::FuncOp>();
+  if (!funcOp)
+    return std::nullopt;
+
+  // The subview size bounds are expected to be constant; they specify the
+  // shape of the allocation.
+  SmallVector<int64_t> shape;
+  for (Value bound : sizeBounds) {
+    APInt value;
+    if (!matchPattern(bound, m_ConstantInt(&value)))
+      return std::nullopt;
+    shape.push_back(value.getSExtValue());
+  }
+
+  builder.setInsertionPoint(&funcOp.front(), funcOp.front().begin());
+  auto type = MemRefType::get(
+      shape, subview.getType().getElementType(), MemRefLayoutAttrInterface{},
+      gpu::AddressSpaceAttr::get(builder.getContext(),
+                                 gpu::GPUDialect::getWorkgroupAddressSpace()));
+  Value buffer = builder.create<memref::AllocOp>(funcOp.getLoc(), type);
+  return buffer;
+}
+
+/// Workgroup buffers are not explicitly deallocated; this is a no-op.
+static LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/) {
+  return success();
+}
+
+/// Copy `src` into the workgroup buffer `dst`, synchronizing the workgroup
+/// with gpu.barrier before and after the copy.
+static LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src,
+                                           Value dst) {
+  b.create<gpu::BarrierOp>(src.getLoc());
+  Operation *copyOp = b.create<memref::CopyOp>(src.getLoc(), src, dst);
+  b.create<gpu::BarrierOp>(copyOp->getLoc());
+  // setMarker(copyOp, getCopyToWorkgroupMemoryMarker());
+  return success();
+}
+
+/// Allocate a private (function-local) buffer at the start of the enclosing
+/// function. The shape of the buffer is given by the (constant) subview size
+/// bounds.
+static std::optional<Value>
+allocateGPUPrivateMemory(OpBuilder &builder, memref::SubViewOp subview,
+                         ArrayRef<Value> sizeBounds, DataLayout &) {
+  OpBuilder::InsertionGuard guard(builder);
+
+  func::FuncOp funcOp = subview->getParentOfType<func::FuncOp>();
+  if (!funcOp)
+    return std::nullopt;
+
+  // The subview size bounds are expected to be constant; they specify the
+  // shape of the allocation.
+  SmallVector<int64_t> shape;
+  for (Value bound : sizeBounds) {
+    APInt value;
+    if (!matchPattern(bound, m_ConstantInt(&value)))
+      return std::nullopt;
+    shape.push_back(value.getSExtValue());
+  }
+
+  builder.setInsertionPoint(&funcOp.front(), funcOp.front().begin());
+  auto type = MemRefType::get(shape, subview.getType().getElementType(),
+                              MemRefLayoutAttrInterface{});
+  Value buffer = builder.create<memref::AllocaOp>(funcOp.getLoc(), type);
+  return buffer;
+}
+
+/// Private buffers are allocated with memref.alloca; this is a no-op.
+static LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/) {
+  return success();
+}
+
+/// Copy `src` into the private buffer `dst`; no synchronization is needed.
+static LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src,
+                                            Value dst) {
+  b.create<memref::CopyOp>(src.getLoc(), src, dst);
+  return success();
+}
 
 DiagnosedSilenceableFailure
 transform::PromoteOp::applyToOne(LinalgOp target,
@@ -1800,6 +1877,22 @@
   if (getAlignment().has_value())
     promotionOptions = promotionOptions.setAlignment(*getAlignment());
 
+  if (getUseGpuWorkgroupMemory())
+    promotionOptions =
+        promotionOptions
+            .setAllocationDeallocationFns(allocateWorkgroupMemory,
+                                          deallocateWorkgroupMemory)
+            .setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory)
+            .setUseFullTileBuffers({false, false});
+
+  if (getUseGpuPrivateMemory())
+    promotionOptions =
+        promotionOptions
+            .setAllocationDeallocationFns(allocateGPUPrivateMemory,
+                                          deallocateGPUPrivateMemory)
+            .setCopyInOutFns(copyToGPUPrivateMemory, copyToGPUPrivateMemory)
+            .setUseFullTileBuffers({false, false});
+
   if (failed(promoteSubviewsPrecondition(target, promotionOptions)))
     return emitDefaultDefiniteFailure(target);
 
Index: mlir/test/Dialect/Linalg/promote.mlir
===================================================================
--- mlir/test/Dialect/Linalg/promote.mlir
+++ mlir/test/Dialect/Linalg/promote.mlir
@@ -142,6 +142,52 @@
   %1 = transform.structured.promote %0
 }
 
+// -----
+
+func.func @gemm(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+  linalg.matmul ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
+                outs(%c: memref<?x?xf32>)
+  return
+}
+
+// CHECK: func @gemm
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK: %[[alloc_A:.*]] = memref.alloc() : memref<16x16xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[alloc_B:.*]] = memref.alloc() : memref<16x16xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1
+// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK: %[[subview_A:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+// CHECK: %[[subview_B:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+// CHECK: %[[subview_C:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
+
+// CHECK: %[[shared_A:.*]] = memref.subview %[[alloc_B]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<workgroup>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
+// CHECK: %[[shared_B:.*]] = memref.subview %[[alloc_A]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<workgroup>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
+
+// CHECK-NEXT: gpu.barrier
+// CHECK-NEXT: memref.copy %[[subview_A]], %[[shared_A]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
+// CHECK-NEXT: gpu.barrier
+
+// CHECK-NEXT: gpu.barrier
+// CHECK-NEXT: memref.copy %[[subview_B]], %[[shared_B]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
+// CHECK-NEXT: gpu.barrier
+
+// CHECK: linalg.matmul ins(%[[shared_A]], %[[shared_B]]{{.*}} outs(%[[subview_C]]
+
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !pdl.operation):
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+  %1, %loops:3 = transform.structured.tile %0 [16, 16, 16] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
+  %2 = transform.structured.promote %1 { operands_to_promote = [0, 1], use_gpu_workgroup_memory }
+}
+
 
 // -----
 
 #map6 = affine_map<(d0, d1, d2) -> (d0, d2)>