diff --git a/mlir/include/mlir/Dialect/NVGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/NVGPU/CMakeLists.txt --- a/mlir/include/mlir/Dialect/NVGPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/NVGPU/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(IR) +add_subdirectory(TransformOps) set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls -name NVGPU) diff --git a/mlir/include/mlir/Dialect/NVGPU/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/NVGPU/TransformOps/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/NVGPU/TransformOps/CMakeLists.txt @@ -0,0 +1,4 @@ +set(LLVM_TARGET_DEFINITIONS NVGPUTransformOps.td) +mlir_tablegen(NVGPUTransformOps.h.inc -gen-op-decls) +mlir_tablegen(NVGPUTransformOps.cpp.inc -gen-op-defs) +add_public_tablegen_target(MLIRNVGPUTransformOpsIncGen) diff --git a/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h @@ -0,0 +1,43 @@ +//===- NVGPUTransformOps.h - NVGPU transform ops ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_NVGPU_TRANSFORMOPS_NVGPUTRANSFORMOPS_H +#define MLIR_DIALECT_NVGPU_TRANSFORMOPS_NVGPUTRANSFORMOPS_H + +#include "mlir/Dialect/Transform/IR/TransformAttrs.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/RegionKindInterface.h" + +namespace mlir { +namespace transform { +class TransformHandleTypeInterface; +} // namespace transform +} // namespace mlir + +namespace mlir { +class DialectRegistry; + +namespace linalg { +class LinalgOp; +} // namespace linalg + +namespace nvgpu { +void registerTransformDialectExtension(DialectRegistry ®istry); +} // namespace nvgpu +} // namespace mlir + +//===----------------------------------------------------------------------===// +// NVGPU Transform Operations +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h.inc" + +#endif // MLIR_DIALECT_NVGPU_TRANSFORMOPS_NVGPUTRANSFORMOPS_H diff --git a/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td @@ -0,0 +1,51 @@ +//===- NVGPUTransformOps.td - NVGPU transform ops ----------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef NVGPU_TRANSFORM_OPS +#define NVGPU_TRANSFORM_OPS + +include "mlir/Dialect/Transform/IR/TransformAttrs.td" +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/IR/TransformInterfaces.td" +include "mlir/Dialect/Transform/IR/TransformTypes.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +//===----------------------------------------------------------------------===// +// RewriteMatmulAsMmaSyncOp +//===----------------------------------------------------------------------===// + +def RewriteMatmulAsMmaSyncOp : + Op { + let description = [{ + Rewrite a matmul operation on memref to an mma.sync operation on vectors. + + Memory copies with the required access patterns are automatically inserted. + Operations that do not have a 1-1 mapping to mma.sync operations are left + unchanged. + }]; + + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs); + + let assemblyFormat = "$target attr-dict `:` functional-type(operands, results) "; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::linalg::LinalgOp linalgOp, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + +#endif // NVGPU_TRANSFORM_OPS diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -55,6 +55,7 @@ #include "mlir/Dialect/MemRef/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" #include "mlir/Dialect/OpenACC/OpenACC.h" #include 
"mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/PDL/IR/PDL.h" @@ -137,6 +138,7 @@ gpu::registerTransformDialectExtension(registry); linalg::registerTransformDialectExtension(registry); memref::registerTransformDialectExtension(registry); + nvgpu::registerTransformDialectExtension(registry); scf::registerTransformDialectExtension(registry); tensor::registerTransformDialectExtension(registry); transform::registerPDLExtension(registry); diff --git a/mlir/lib/Dialect/NVGPU/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/CMakeLists.txt --- a/mlir/lib/Dialect/NVGPU/CMakeLists.txt +++ b/mlir/lib/Dialect/NVGPU/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(IR) add_subdirectory(Utils) +add_subdirectory(TransformOps) add_subdirectory(Transforms) diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/TransformOps/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/NVGPU/TransformOps/CMakeLists.txt @@ -0,0 +1,21 @@ +add_mlir_dialect_library(MLIRNVGPUTransformOps + NVGPUTransformOps.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/NVGPU/TransformOps + + DEPENDS + MLIRNVGPUTransformOpsIncGen + + LINK_LIBS PUBLIC + MLIRAffineDialect + MLIRArithDialect + MLIRIR + MLIRLinalgDialect + MLIRNVGPUDialect + MLIRParser + MLIRSideEffectInterfaces + MLIRTransformDialect + MLIRTransformDialectUtils + MLIRVectorTransforms + ) diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -0,0 +1,488 @@ +//===- NVGPUTransformOps.cpp - Implementation of NVGPU transform ops ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Support/LogicalResult.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace mlir; +using namespace mlir::linalg; +using namespace mlir::nvgpu; +using namespace mlir::transform; + +#define DEBUG_TYPE "nvgpu-transforms" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define DBGSNL() (llvm::dbgs() << "\n") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +//===----------------------------------------------------------------------===// +// RewriteMatmulAsMmaSyncOp +//===----------------------------------------------------------------------===// + +/// Helper struct to encode a pair of row/column indexings in the form of +/// affine expressions. 
+struct RowColIndexing : private std::pair { + RowColIndexing(AffineExpr row, AffineExpr col) + : std::pair(row, col) {} + + AffineExpr row() const { return first; }; + AffineExpr col() const { return second; }; + + void print(llvm::raw_ostream &os) const { + os << "- indexing: " << first << ", " << second; + } +}; + +/// Helper struct to provide a simple mapping from matmul operations to the +/// corresponding mma.sync operation. This is constrained to the case where the +/// matmul matches the mma.sync operation 1-1. +struct MmaSyncBuilder { + MmaSyncBuilder(OpBuilder &b, Location loc, OpFoldResult laneId) + : b(b), loc(loc), laneId(laneId) {} + + using IndexCalculator = + std::function(MLIRContext *)>; + + /// Create the mma.sync operation corresponding to `linalgOp` along with all + /// the supporting load/store and vector operations. + FailureOr buildMmaSync(LinalgOp linalgOp); + +private: + struct MmaSyncInfo { + std::tuple indexFns; + std::tuple, SmallVector, SmallVector> + vectorShapes; + SmallVector mmaShape; + bool tf32Enabled; + }; + + /// Return the specific index calculator for the given `linalgOp` or failure + /// if the op is not supported. This is the toplevel switch that should just + /// be Tablegen'd in the future. + FailureOr getIndexCalculators(ArrayRef opShape, + TypeRange elementalTypes); + + //===--------------------------------------------------------------------===// + // Instruction-specific row, column indexing expression builders. + // These should all be declaratively specified via Tablegen in the future. + // The Tablegen specification should be as straightforward as possible to + // only model the existing size and type combinations. + //===--------------------------------------------------------------------===// + // + // TODO: Tablegen all this. + //===--------------------------------------------------------------------===// + // m16n8k4 tf32 case. 
+ //===--------------------------------------------------------------------===// + /// From the NVIDIA doc: + /// groupID = %laneid >> 2 + /// threadIDInGroup = %laneid % 4 + /// row = groupID for a0 + /// groupID + 8 for a1 + /// col = threadIDInGroup + static SmallVector m16n8k4tf32Lhs(MLIRContext *ctx) { + auto dim = getAffineDimExpr(0, ctx); + AffineExpr groupID = dim.floorDiv(4); + AffineExpr threadIDInGroup = dim % 4; + return {RowColIndexing{groupID, threadIDInGroup}, + RowColIndexing{groupID + 8, threadIDInGroup}}; + } + + /// From the NVIDIA doc: + /// groupID = %laneid >> 2 + /// threadIDInGroup = %laneid % 4 + /// row = threadIDInGroup + /// col = groupID + static SmallVector m16n8k4tf32Rhs(MLIRContext *ctx) { + auto dim = getAffineDimExpr(0, ctx); + AffineExpr groupID = dim.floorDiv(4); + AffineExpr threadIDInGroup = dim % 4; + return {RowColIndexing{threadIDInGroup, groupID}}; + } + + /// From the NVIDIA doc: + /// groupID = %laneid >> 2 + /// threadIDInGroup = %laneid % 4 + /// row = groupID for c0 and c1 + /// groupID + 8 for c2 and c3 + /// col = (threadIDInGroup * 2) + (i & 0x1) for ci where i = {0,..,3} + static SmallVector m16n8k4tf32Res(MLIRContext *ctx) { + auto dim = getAffineDimExpr(0, ctx); + AffineExpr groupID = dim.floorDiv(4); + AffineExpr threadIDInGroup = dim % 4; + return {RowColIndexing{groupID, threadIDInGroup * 2 + 0}, + RowColIndexing{groupID, threadIDInGroup * 2 + 1}, + RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0}, + RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1}}; + } + + //===--------------------------------------------------------------------===// + // m16n8k16 f16 case. 
+ //===--------------------------------------------------------------------===// + /// From the NVIDIA doc: + /// groupID = %laneid >> 2 + /// threadIDInGroup = %laneid % 4 + /// + /// row = groupID for ai where 0 <= i < 2 || 4 <= i < 6 + /// groupID + 8 Otherwise + /// + /// col = (threadIDInGroup * 2) + (i & 0x1) for ai where i < 4 + /// (threadIDInGroup * 2) + (i & 0x1) + 8 for ai where i >= 4 + static SmallVector m16n8k16f16Lhs(MLIRContext *ctx) { + auto dim = getAffineDimExpr(0, ctx); + AffineExpr groupID = dim.floorDiv(4); + AffineExpr threadIDInGroup = dim % 4; + // clang-format off + return { + RowColIndexing{groupID, threadIDInGroup * 2 + 0}, // i == 0 + RowColIndexing{groupID, threadIDInGroup * 2 + 1}, // i == 1 + RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0}, // i == 2 + RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1}, // i == 3 + RowColIndexing{groupID, threadIDInGroup * 2 + 0 + 8}, // i == 4 + RowColIndexing{groupID, threadIDInGroup * 2 + 1 + 8}, // i == 5 + RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0 + 8}, // i == 6 + RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1 + 8} // i == 7 + }; + // clang-format on + } + + /// From the NVIDIA doc: + /// groupID = %laneid >> 2 + /// threadIDInGroup = %laneid % 4 + /// + /// row = (threadIDInGroup * 2) + (i & 0x1) for bi where i < 2 + /// (threadIDInGroup * 2) + (i & 0x1) + 8 for bi where i >= 2 + /// + /// col = groupID + static SmallVector m16n8k16f16Rhs(MLIRContext *ctx) { + auto dim = getAffineDimExpr(0, ctx); + AffineExpr groupID = dim.floorDiv(4); + AffineExpr threadIDInGroup = dim % 4; + // clang-format off + return { + RowColIndexing{threadIDInGroup * 2 + 0, groupID}, // i == 0 + RowColIndexing{threadIDInGroup * 2 + 1, groupID}, // i == 1 + RowColIndexing{threadIDInGroup * 2 + 0 + 8, groupID}, // i == 2 + RowColIndexing{threadIDInGroup * 2 + 1 + 8, groupID} // i == 3 + }; + // clang-format on + } + + /// From the NVIDIA doc: + /// groupID = %laneid >> 2 + /// threadIDInGroup = 
%laneid % 4 + /// + /// row = groupID for ci where i < 2 + /// groupID + 8 for ci where i >= 2 + /// + /// col = (threadIDInGroup * 2) + (i & 0x1) for ci where i = {0,..,3} + static SmallVector m16n8k16f16Res(MLIRContext *ctx) { + auto dim = getAffineDimExpr(0, ctx); + AffineExpr groupID = dim.floorDiv(4); + AffineExpr threadIDInGroup = dim % 4; + // clang-format off + return { + RowColIndexing{groupID, threadIDInGroup * 2 + 0}, // i == 0 + RowColIndexing{groupID, threadIDInGroup * 2 + 1}, // i == 1 + RowColIndexing{groupID + 8, threadIDInGroup * 2 + 0}, // i == 2 + RowColIndexing{groupID + 8, threadIDInGroup * 2 + 1} // i == 3 + }; + // clang-format on + } + + //===--------------------------------------------------------------------===// + /// Helper functions to create customizable load and stores operations. The + /// specific shapes of each MMA instruction are passed via the + /// IndexCalculator callback. + //===--------------------------------------------------------------------===// + /// Build a list of memref.load operations indexed at `(row, col)` indices + /// that make sense for a particular MMA instruction and specified via the + /// IndexCalculator callback. + SmallVector buildMemrefLoads(OpBuilder &b, Location loc, + OpFoldResult laneId, Value memref, + IndexCalculator indexFn); + + /// Perform a distributed load of a vector operand of `vectorShape` for a + /// particular MMA instruction whose `(row, col)` indices are specified via + /// the IndexCalculator callback. Each `laneId` loads the subportion of the + /// data that makes sense for the particular MMA operation. + /// The `vectorShape` matches existing NVGPU dialect op specification but + /// could also be flattened in the future if needed for simplification. 
+ Value buildMmaSyncMemrefLoadOperand(OpBuilder &b, Location loc, + OpFoldResult laneId, Value memref, + IndexCalculator indexFn, + ArrayRef vectorShape); + + /// Build a list of memref.store operations indexed at `(row, col)` indices + /// that make sense for a particular MMA instruction and specified via the + /// IndexCalculator callback. + SmallVector buildMemrefStores(OpBuilder &b, Location loc, + ValueRange toStore, + OpFoldResult laneId, Value memref, + IndexCalculator indexFn); + + /// Perform a distributed store of a vector operand of `vectorShape` for a + /// particular MMA instruction whose `(row, col)` indices are specified via + /// the IndexCalculator callback. Each `laneId` loads the subportion of the + /// data that makes sense for the particular MMA operation. + /// The `vectorShape` matches existing NVGPU dialect op specification but + /// could also be flattened in the future if needed for simplification. + SmallVector buildMmaSyncMemrefStoreOperand( + OpBuilder &b, Location loc, Value vectorToStore, OpFoldResult laneId, + Value memref, IndexCalculator indexFn, ArrayRef vectorShape); + + OpBuilder &b; + Location loc; + OpFoldResult laneId; +}; + +//===--------------------------------------------------------------------===// +/// Helper functions to create customizable load and stores operations. The +/// specific shapes of each MMA instruction are passed via the +/// IndexCalculator callback. 
+//===--------------------------------------------------------------------===// + +template +static void foreachIndividualVectorElement(Value vector, ApplyFn applyFn, + ReduceFn reduceFn) { + VectorType vectorType = vector.getType().cast(); + auto vectorShape = vectorType.getShape(); + auto strides = computeStrides(vectorShape); + for (int64_t idx = 0, e = vectorShape[0] * strides[0]; idx < e; ++idx) { + auto indices = delinearize(idx, strides); + reduceFn(applyFn(vector, idx, indices), idx, indices); + } +} + +SmallVector MmaSyncBuilder::buildMemrefLoads(OpBuilder &b, Location loc, + OpFoldResult laneId, + Value memref, + IndexCalculator indexFn) { + auto aff = [&](AffineExpr e) { + return affine::makeComposedFoldedAffineApply(b, loc, e, laneId); + }; + SmallVector res; + SmallVector indexings = indexFn(b.getContext()); + for (auto indexing : indexings) { + Value row = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.row())); + Value col = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.col())); + auto load = b.create(loc, memref, ValueRange{row, col}); + res.push_back(load); + } + return res; +} + +Value MmaSyncBuilder::buildMmaSyncMemrefLoadOperand( + OpBuilder &b, Location loc, OpFoldResult laneId, Value memref, + IndexCalculator indexFn, ArrayRef vectorShape) { + auto loads = buildMemrefLoads(b, loc, laneId, memref, indexFn); + + Type elementType = getElementTypeOrSelf(memref.getType()); + auto vt = VectorType::get(vectorShape, elementType); + Value res = b.create(loc, vt, loads[0]); + foreachIndividualVectorElement( + res, + /*applyFn=*/ + [&](Value v, int64_t linearIdx, ArrayRef indices) { + return loads[linearIdx]; + }, + /*reduceFn=*/ + [&](Value v, int64_t linearIdx, ArrayRef indices) { + res = b.create(loc, v, res, indices); + }); + + return res; +} + +SmallVector +MmaSyncBuilder::buildMemrefStores(OpBuilder &b, Location loc, + ValueRange toStore, OpFoldResult laneId, + Value memref, IndexCalculator indexFn) { + auto aff = [&](AffineExpr e) { + 
return affine::makeComposedFoldedAffineApply(b, loc, e, laneId); + }; + SmallVector res; + for (auto [indexing, val] : + llvm::zip_equal(indexFn(b.getContext()), toStore)) { + Value row = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.row())); + Value col = getValueOrCreateConstantIndexOp(b, loc, aff(indexing.col())); + Operation *store = + b.create(loc, val, memref, ValueRange{row, col}); + res.push_back(store); + } + return res; +} + +SmallVector MmaSyncBuilder::buildMmaSyncMemrefStoreOperand( + OpBuilder &b, Location loc, Value vectorToStore, OpFoldResult laneId, + Value memref, IndexCalculator indexFn, ArrayRef vectorShape) { + SmallVector toStore; + toStore.reserve(32); + foreachIndividualVectorElement( + vectorToStore, + /*applyFn=*/ + [&](Value v, int64_t linearIdx, ArrayRef indices) { + return b.create(loc, vectorToStore, indices); + }, + /*reduceFn=*/ + [&](Value v, int64_t linearIdx, ArrayRef indices) { + toStore.push_back(v); + }); + return buildMemrefStores(b, loc, toStore, laneId, memref, indexFn); +} + +static std::tuple, SmallVector, + SmallVector> +makeVectorShapes(ArrayRef lhs, ArrayRef rhs, + ArrayRef res) { + SmallVector vlhs{lhs.begin(), lhs.end()}; + SmallVector vrhs{rhs.begin(), rhs.end()}; + SmallVector vres{res.begin(), res.end()}; + return std::make_tuple(vlhs, vrhs, vres); +} + +FailureOr +MmaSyncBuilder::getIndexCalculators(ArrayRef opShape, + TypeRange elementalTypes) { + // TODO: Tablegen all this. + Type f16 = b.getF16Type(); + Type f32 = b.getF32Type(); + if (opShape == ArrayRef{16, 8, 4} && + elementalTypes == TypeRange{f32, f32, f32}) { + return MmaSyncInfo{std::make_tuple(&MmaSyncBuilder::m16n8k4tf32Lhs, + &MmaSyncBuilder::m16n8k4tf32Rhs, + &MmaSyncBuilder::m16n8k4tf32Res), + makeVectorShapes({2, 1}, {1, 1}, {2, 2}), + SmallVector{opShape.begin(), opShape.end()}, + /*tf32Enabled=*/true}; + } + // This is the version with f16 accumulation. + // TODO: version with f32 accumulation. 
+ if (opShape == ArrayRef{16, 8, 16} && + elementalTypes == TypeRange{f16, f16, f16}) { + return MmaSyncInfo{std::make_tuple(&MmaSyncBuilder::m16n8k16f16Lhs, + &MmaSyncBuilder::m16n8k16f16Rhs, + &MmaSyncBuilder::m16n8k16f16Res), + makeVectorShapes({4, 2}, {2, 2}, {2, 2}), + SmallVector{opShape.begin(), opShape.end()}, + /*tf32Enabled=*/false}; + } + return failure(); +} + +FailureOr MmaSyncBuilder::buildMmaSync(LinalgOp linalgOp) { + Value lhsMemref = linalgOp.getDpsInputOperand(0)->get(); + Value rhsMemref = linalgOp.getDpsInputOperand(1)->get(); + Value resMemref = linalgOp.getDpsInitOperand(0)->get(); + assert(lhsMemref.getType().cast().getRank() == 2 && + "expected lhs to be a 2D memref"); + assert(rhsMemref.getType().cast().getRank() == 2 && + "expected rhs to be a 2D memref"); + assert(resMemref.getType().cast().getRank() == 2 && + "expected res to be a 2D memref"); + + int64_t m = cast(lhsMemref.getType()).getShape()[0]; + int64_t n = cast(rhsMemref.getType()).getShape()[1]; + int64_t k = cast(lhsMemref.getType()).getShape()[1]; + Type lhsType = getElementTypeOrSelf(lhsMemref.getType()); + Type rhsType = getElementTypeOrSelf(rhsMemref.getType()); + Type resType = getElementTypeOrSelf(resMemref.getType()); + + FailureOr maybeInfo = + getIndexCalculators({m, n, k}, {lhsType, rhsType, resType}); + if (failed(maybeInfo)) + return failure(); + + MmaSyncInfo info = *maybeInfo; + auto [lhsIndexFn, rhsIndexFn, resIndexFn] = info.indexFns; + auto [lhsShape, rhsShape, resShape] = info.vectorShapes; + Value lhs = buildMmaSyncMemrefLoadOperand(b, loc, laneId, lhsMemref, + lhsIndexFn, lhsShape); + Value rhs = buildMmaSyncMemrefLoadOperand(b, loc, laneId, rhsMemref, + rhsIndexFn, rhsShape); + Value res = buildMmaSyncMemrefLoadOperand(b, loc, laneId, resMemref, + resIndexFn, resShape); + res = b.create(loc, lhs, rhs, res, info.mmaShape, + info.tf32Enabled); + buildMmaSyncMemrefStoreOperand(b, loc, res, laneId, resMemref, resIndexFn, + resShape); + return 
res.getDefiningOp(); +} + +DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne( + transform::TransformRewriter &rewriter, LinalgOp linalgOp, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { + bool fail = true; + // TODO: more robust detection of matmulOp, with transposes etc. + if (auto matmulOp = isa(linalgOp.getOperation())) { + Location loc = linalgOp.getLoc(); + // TODO: more robust computation of laneId, for now assume a single warp. + Value laneId = rewriter.create( + loc, rewriter.getIndexType(), gpu::Dimension::x); + if (succeeded(MmaSyncBuilder(rewriter, loc, laneId).buildMmaSync(linalgOp))) + fail = false; + } + + if (fail) { + DiagnosedSilenceableFailure diag = emitSilenceableError() + << "unsupported target op: " << linalgOp; + diag.attachNote(linalgOp->getLoc()) << "target op"; + return diag; + } + + rewriter.eraseOp(linalgOp); + return DiagnosedSilenceableFailure::success(); +} + +//===----------------------------------------------------------------------===// +// Transform op registration +//===----------------------------------------------------------------------===// + +namespace { +class NVGPUTransformDialectExtension + : public transform::TransformDialectExtension< + NVGPUTransformDialectExtension> { +public: + NVGPUTransformDialectExtension() { + declareGeneratedDialect(); + declareGeneratedDialect(); + declareGeneratedDialect(); + declareGeneratedDialect(); + registerTransformOps< +#define GET_OP_LIST +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc" + >(); + } +}; +} // namespace + +#define GET_OP_CLASSES +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc" + +void mlir::nvgpu::registerTransformDialectExtension(DialectRegistry ®istry) { + registry.addExtensions(); +} diff --git a/mlir/test/Dialect/NVGPU/transform-matmul-to-nvvm.mlir b/mlir/test/Dialect/NVGPU/transform-matmul-to-nvvm.mlir new file mode 100644 --- /dev/null +++ 
b/mlir/test/Dialect/NVGPU/transform-matmul-to-nvvm.mlir @@ -0,0 +1,113 @@ +// RUN: mlir-opt %s -test-transform-dialect-interpreter -split-input-file | FileCheck %s + +// CHECK: #[[$div4:.*]] = affine_map<()[s0] -> (s0 floordiv 4)> +// CHECK: #[[$mod4:.*]] = affine_map<()[s0] -> (s0 mod 4)> +// CHECK: #[[$div4p8:.*]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)> +// CHECK: #[[$map3:.*]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)> +// CHECK: #[[$map4:.*]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 1)> + +// CHECK-LABEL: func.func @matmul_16x8x4xf32_global +func.func @matmul_16x8x4xf32_global( + %A: memref<16x4xf32>, %B: memref<4x8xf32>, %C: memref<16x8xf32>) { +// CHECK-SAME: %[[VAL_0:.*]]: memref<16x4xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<4x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<16x8xf32>) { + +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[VAL_4:.*]] = affine.apply #[[$div4]]()[%[[TIDX]]] +// CHECK: %[[VAL_5:.*]] = affine.apply #[[$mod4]]()[%[[TIDX]]] +// CHECK: %[[VAL_6:.*]] = memref.load %[[VAL_0]][%[[VAL_4]], %[[VAL_5]]] : memref<16x4xf32> +// CHECK: %[[VAL_7:.*]] = affine.apply #[[$div4p8]]()[%[[TIDX]]] +// CHECK: %[[VAL_8:.*]] = affine.apply #[[$mod4]]()[%[[TIDX]]] +// CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_0]][%[[VAL_7]], %[[VAL_8]]] : memref<16x4xf32> +// CHECK: %[[VAL_10:.*]] = vector.splat %[[VAL_6]] : vector<2x1xf32> +// CHECK: %[[VAL_11:.*]] = vector.insert %[[VAL_6]], %[[VAL_10]] [0, 0] : f32 into vector<2x1xf32> +// CHECK: %[[LHS:.*]] = vector.insert %[[VAL_9]], %[[VAL_11]] [1, 0] : f32 into vector<2x1xf32> +// +// CHECK: %[[VAL_13:.*]] = affine.apply #[[$mod4]]()[%[[TIDX]]] +// CHECK: %[[VAL_14:.*]] = affine.apply #[[$div4]]()[%[[TIDX]]] +// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_1]][%[[VAL_13]], %[[VAL_14]]] : memref<4x8xf32> +// CHECK: %[[VAL_16:.*]] = vector.splat %[[VAL_15]] : vector<1x1xf32> +// CHECK: %[[RHS:.*]] = vector.insert %[[VAL_15]], %[[VAL_16]] [0, 0] : f32 into vector<1x1xf32> +// +// 
CHECK: %[[VAL_18:.*]] = affine.apply #[[$div4]]()[%[[TIDX]]] +// CHECK: %[[VAL_19:.*]] = affine.apply #[[$map3]]()[%[[TIDX]]] +// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_2]][%[[VAL_18]], %[[VAL_19]]] : memref<16x8xf32> +// CHECK: %[[VAL_21:.*]] = affine.apply #[[$div4]]()[%[[TIDX]]] +// CHECK: %[[VAL_22:.*]] = affine.apply #[[$map4]]()[%[[TIDX]]] +// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_2]][%[[VAL_21]], %[[VAL_22]]] : memref<16x8xf32> +// CHECK: %[[VAL_24:.*]] = affine.apply #[[$div4p8]]()[%[[TIDX]]] +// CHECK: %[[VAL_25:.*]] = affine.apply #[[$map3]]()[%[[TIDX]]] +// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_2]][%[[VAL_24]], %[[VAL_25]]] : memref<16x8xf32> +// CHECK: %[[VAL_27:.*]] = affine.apply #[[$div4p8]]()[%[[TIDX]]] +// CHECK: %[[VAL_28:.*]] = affine.apply #[[$map4]]()[%[[TIDX]]] +// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_2]][%[[VAL_27]], %[[VAL_28]]] : memref<16x8xf32> +// CHECK: %[[VAL_30:.*]] = vector.splat %[[VAL_20]] : vector<2x2xf32> +// CHECK: %[[VAL_31:.*]] = vector.insert %[[VAL_20]], %[[VAL_30]] [0, 0] : f32 into vector<2x2xf32> +// CHECK: %[[VAL_32:.*]] = vector.insert %[[VAL_23]], %[[VAL_31]] [0, 1] : f32 into vector<2x2xf32> +// CHECK: %[[VAL_33:.*]] = vector.insert %[[VAL_26]], %[[VAL_32]] [1, 0] : f32 into vector<2x2xf32> +// CHECK: %[[RES:.*]] = vector.insert %[[VAL_29]], %[[VAL_33]] [1, 1] : f32 into vector<2x2xf32> +// +// CHECK: %[[VAL_35:.*]] = nvgpu.mma.sync(%[[LHS]], %[[RHS]], %[[RES]]) {mmaShape = [16, 8, 4], tf32Enabled} : (vector<2x1xf32>, vector<1x1xf32>, vector<2x2xf32>) -> vector<2x2xf32> +// +// CHECK: %[[VAL_36:.*]] = vector.extract %[[VAL_35]][0, 0] : vector<2x2xf32> +// CHECK: %[[VAL_37:.*]] = vector.extract %[[VAL_35]][0, 1] : vector<2x2xf32> +// CHECK: %[[VAL_38:.*]] = vector.extract %[[VAL_35]][1, 0] : vector<2x2xf32> +// CHECK: %[[VAL_39:.*]] = vector.extract %[[VAL_35]][1, 1] : vector<2x2xf32> +// CHECK: %[[VAL_40:.*]] = affine.apply #[[$div4]]()[%[[TIDX]]] +// CHECK: %[[VAL_41:.*]] = affine.apply 
#[[$map3]]()[%[[TIDX]]] +// CHECK: memref.store %[[VAL_36]], %[[VAL_2]][%[[VAL_40]], %[[VAL_41]]] : memref<16x8xf32> +// CHECK: %[[VAL_42:.*]] = affine.apply #[[$div4]]()[%[[TIDX]]] +// CHECK: %[[VAL_43:.*]] = affine.apply #[[$map4]]()[%[[TIDX]]] +// CHECK: memref.store %[[VAL_37]], %[[VAL_2]][%[[VAL_42]], %[[VAL_43]]] : memref<16x8xf32> +// CHECK: %[[VAL_44:.*]] = affine.apply #[[$div4p8]]()[%[[TIDX]]] +// CHECK: %[[VAL_45:.*]] = affine.apply #[[$map3]]()[%[[TIDX]]] +// CHECK: memref.store %[[VAL_38]], %[[VAL_2]][%[[VAL_44]], %[[VAL_45]]] : memref<16x8xf32> +// CHECK: %[[VAL_46:.*]] = affine.apply #[[$div4p8]]()[%[[TIDX]]] +// CHECK: %[[VAL_47:.*]] = affine.apply #[[$map4]]()[%[[TIDX]]] +// CHECK: memref.store %[[VAL_39]], %[[VAL_2]][%[[VAL_46]], %[[VAL_47]]] : memref<16x8xf32> +// CHECK: return +// CHECK: } + linalg.matmul ins(%A, %B: memref<16x4xf32>, memref<4x8xf32>) + outs(%C: memref<16x8xf32>) + return +} + +transform.sequence failures(propagate) { +^bb1(%arg1: !transform.any_op): + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.nvgpu.rewrite_matmul_as_mma_sync %matmul + : (!transform.any_op) -> () +} + +// ----- + +// CHECK-LABEL: func.func @matmul_16x8x16xf16_global +func.func @matmul_16x8x16xf16_global( + %A: memref<16x16xf16>, %B: memref<16x8xf16>, %C: memref<16x8xf16>) { + + // CHECK-COUNT-8: memref.load {{.*}} : memref<16x16xf16> + // CHECK-COUNT-8: vector.insert {{.*}} : f16 into vector<4x2xf16> + // CHECK-COUNT-4: memref.load {{.*}} : memref<16x8xf16> + // CHECK-COUNT-4: vector.insert {{.*}} : f16 into vector<2x2xf16> + // CHECK-COUNT-4: memref.load {{.*}} : memref<16x8xf16> + // CHECK-COUNT-4: vector.insert {{.*}} : f16 into vector<2x2xf16> + // + // CHECK: nvgpu.mma.sync(%{{.*}}) {mmaShape = [16, 8, 16]} + // CHECK-SAME: : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> + // + // CHECK-COUNT-4: vector.extract %{{.*}} : vector<2x2xf16> + // 
CHECK-COUNT-4: memref.store %{{.*}} : memref<16x8xf16> + linalg.matmul ins(%A, %B: memref<16x16xf16>, memref<16x8xf16>) + outs(%C: memref<16x8xf16>) + return +} + +transform.sequence failures(propagate) { +^bb1(%arg1: !transform.any_op): + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.nvgpu.rewrite_matmul_as_mma_sync %matmul + : (!transform.any_op) -> () +} diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/lit.local.cfg new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.enable_cuda_runner or not config.mlir_run_cuda_sm80_tests: + config.unsupported = True diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir @@ -0,0 +1,239 @@ +// RUN: mlir-opt %s \ +// RUN: -test-transform-dialect-interpreter \ +// RUN: -test-transform-dialect-erase-schedule \ +// RUN: -gpu-kernel-outlining \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-vector-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -expand-strided-metadata \ +// RUN: -lower-affine \ +// RUN: -convert-index-to-llvm=index-bitwidth=32 \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-func-to-llvm \ +// RUN: -canonicalize \ +// RUN: | mlir-opt 
-pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm{use-opaque-pointers=1},lower-affine,convert-scf-to-cf,convert-vector-to-llvm,convert-math-to-llvm,expand-strided-metadata,lower-affine,convert-index-to-llvm{index-bitwidth=32},convert-arith-to-llvm,reconcile-unrealized-casts,gpu-to-cubin{chip=sm_80 features=+ptx76}))' \ +// RUN: | mlir-opt -convert-index-to-llvm=index-bitwidth=32 \ +// RUN: -gpu-to-llvm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner \ +// RUN: --shared-libs=%mlir_cuda_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +!lhs_memref_type = memref<16x16xf16> +!rhs_memref_type = memref<16x8xf16> +!res_memref_type = memref<16x8xf16> + +func.func @compute_linspace_val(%ridx: index, %cidx: index, %strideCidx: index) -> f16 { + %r = arith.index_cast %ridx : index to i32 + %c = arith.index_cast %cidx : index to i32 + %strideC = arith.index_cast %strideCidx : index to i32 + %2 = arith.muli %r, %strideC : i32 + %3 = arith.addi %c, %2 : i32 + %4 = arith.sitofp %3 : i32 to f16 + %factor = arith.constant 64.0 : f16 + %5 = arith.divf %4, %factor : f16 + return %5: f16 +} + +func.func @print_lhs_as_memref_32(%lhs: !lhs_memref_type) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %M = memref.dim %lhs, %c0 : !lhs_memref_type + %N = memref.dim %lhs, %c1 : !lhs_memref_type + %tmp_alloc = memref.alloc(%M, %N) : memref<?x?xf32> + scf.for %m = %c0 to %M step %c1 { + scf.for %n = %c0 to %N step %c1 { + %f16 = memref.load %lhs[%m, %n] : !lhs_memref_type + %f32 = arith.extf %f16 : f16 to f32 + memref.store %f32, %tmp_alloc[%m, %n] : memref<?x?xf32> + } + } + %casted = memref.cast %tmp_alloc : memref<?x?xf32> to memref<*xf32> + call @printMemrefF32(%casted) : (memref<*xf32>) -> () + memref.dealloc %tmp_alloc : memref<?x?xf32> + return +} + +func.func @print_rhs_as_memref_32(%rhs: !rhs_memref_type) { + %c0 = arith.constant 
0 : index + %c1 = arith.constant 1 : index + %M = memref.dim %rhs, %c0 : !rhs_memref_type + %N = memref.dim %rhs, %c1 : !rhs_memref_type + %tmp_alloc = memref.alloc(%M, %N) : memref<?x?xf32> + scf.for %m = %c0 to %M step %c1 { + scf.for %n = %c0 to %N step %c1 { + %f16 = memref.load %rhs[%m, %n] : !rhs_memref_type + %f32 = arith.extf %f16 : f16 to f32 + memref.store %f32, %tmp_alloc[%m, %n] : memref<?x?xf32> + } + } + %casted = memref.cast %tmp_alloc : memref<?x?xf32> to memref<*xf32> + call @printMemrefF32(%casted) : (memref<*xf32>) -> () + memref.dealloc %tmp_alloc : memref<?x?xf32> + return +} + +func.func @print_res_as_memref_32(%res: !res_memref_type) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %M = memref.dim %res, %c0 : !res_memref_type + %N = memref.dim %res, %c1 : !res_memref_type + %tmp_alloc = memref.alloc(%M, %N) : memref<?x?xf32> + scf.for %m = %c0 to %M step %c1 { + scf.for %n = %c0 to %N step %c1 { + %f16 = memref.load %res[%m, %n] : !res_memref_type + %f32 = arith.extf %f16 : f16 to f32 + memref.store %f32, %tmp_alloc[%m, %n] : memref<?x?xf32> + } + } + %casted = memref.cast %tmp_alloc : memref<?x?xf32> to memref<*xf32> + call @printMemrefF32(%casted) : (memref<*xf32>) -> () + memref.dealloc %tmp_alloc : memref<?x?xf32> + return +} + +func.func @main() { + %lhs = memref.alloc() : !lhs_memref_type + %rhs = memref.alloc() : !rhs_memref_type + %res = memref.alloc() : !res_memref_type + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %M = memref.dim %res, %c0 : !res_memref_type + %N = memref.dim %res, %c1 : !res_memref_type + %K = memref.dim %lhs, %c1 : !lhs_memref_type + + %f1 = arith.constant 1.0e+00 : f16 + %f0 = arith.constant 0.0e+00 : f16 + %c32 = arith.constant 32 : index + + // Initialize the lhs matrix with a linspace function. 
+ scf.for %r = %c0 to %M step %c1 { + scf.for %c = %c0 to %K step %c1 { + %idx = func.call @compute_linspace_val(%r, %c, %K) : (index, index, index) -> f16 + memref.store %idx, %lhs[%r, %c] : !lhs_memref_type + } + } + // Initialize the rhs matrix with a linspace function. + scf.for %r = %c0 to %K step %c1 { + scf.for %c = %c0 to %N step %c1 { + %idx = func.call @compute_linspace_val(%r, %c, %N) : (index, index, index) -> f16 + memref.store %idx, %rhs[%r, %c] : !rhs_memref_type + } + } + // Initialize the res matrix with a linspace function. + scf.for %r = %c0 to %M step %c1 { + scf.for %c = %c0 to %N step %c1 { + %idx = func.call @compute_linspace_val(%r, %c, %N) : (index, index, index) -> f16 + memref.store %idx, %res[%r, %c] : !res_memref_type + } + } + + %ulhs = memref.cast %lhs : !lhs_memref_type to memref<*xf16> + %urhs = memref.cast %rhs : !rhs_memref_type to memref<*xf16> + %ures = memref.cast %res : !res_memref_type to memref<*xf16> + gpu.host_register %ulhs : memref<*xf16> + gpu.host_register %urhs : memref<*xf16> + gpu.host_register %ures : memref<*xf16> + + // Print the memrefs before computation. 
+ call @print_lhs_as_memref_32(%lhs) : (!lhs_memref_type) -> () + // CHECK: [0, 0.015625, 0.03125, 0.046875, 0.0625, 0.078125, 0.09375, 0.109375, 0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375], + // CHECK: [0.25, 0.265625, 0.28125, 0.296875, 0.3125, 0.328125, 0.34375, 0.359375, 0.375, 0.390625, 0.40625, 0.421875, 0.4375, 0.453125, 0.46875, 0.484375], + // CHECK: [0.5, 0.515625, 0.53125, 0.546875, 0.5625, 0.578125, 0.59375, 0.609375, 0.625, 0.640625, 0.65625, 0.671875, 0.6875, 0.703125, 0.71875, 0.734375], + // CHECK: [0.75, 0.765625, 0.78125, 0.796875, 0.8125, 0.828125, 0.84375, 0.859375, 0.875, 0.890625, 0.90625, 0.921875, 0.9375, 0.953125, 0.96875, 0.984375], + // CHECK: [1, 1.01562, 1.03125, 1.04688, 1.0625, 1.07812, 1.09375, 1.10938, 1.125, 1.14062, 1.15625, 1.17188, 1.1875, 1.20312, 1.21875, 1.23438], + // CHECK: [1.25, 1.26562, 1.28125, 1.29688, 1.3125, 1.32812, 1.34375, 1.35938, 1.375, 1.39062, 1.40625, 1.42188, 1.4375, 1.45312, 1.46875, 1.48438], + // CHECK: [1.5, 1.51562, 1.53125, 1.54688, 1.5625, 1.57812, 1.59375, 1.60938, 1.625, 1.64062, 1.65625, 1.67188, 1.6875, 1.70312, 1.71875, 1.73438], + // CHECK: [1.75, 1.76562, 1.78125, 1.79688, 1.8125, 1.82812, 1.84375, 1.85938, 1.875, 1.89062, 1.90625, 1.92188, 1.9375, 1.95312, 1.96875, 1.98438], + // CHECK: [2, 2.01562, 2.03125, 2.04688, 2.0625, 2.07812, 2.09375, 2.10938, 2.125, 2.14062, 2.15625, 2.17188, 2.1875, 2.20312, 2.21875, 2.23438], + // CHECK: [2.25, 2.26562, 2.28125, 2.29688, 2.3125, 2.32812, 2.34375, 2.35938, 2.375, 2.39062, 2.40625, 2.42188, 2.4375, 2.45312, 2.46875, 2.48438], + // CHECK: [2.5, 2.51562, 2.53125, 2.54688, 2.5625, 2.57812, 2.59375, 2.60938, 2.625, 2.64062, 2.65625, 2.67188, 2.6875, 2.70312, 2.71875, 2.73438], + // CHECK: [2.75, 2.76562, 2.78125, 2.79688, 2.8125, 2.82812, 2.84375, 2.85938, 2.875, 2.89062, 2.90625, 2.92188, 2.9375, 2.95312, 2.96875, 2.98438], + // CHECK: [3, 3.01562, 3.03125, 3.04688, 3.0625, 3.07812, 3.09375, 3.10938, 3.125, 3.14062, 3.15625, 
3.17188, 3.1875, 3.20312, 3.21875, 3.23438], + // CHECK: [3.25, 3.26562, 3.28125, 3.29688, 3.3125, 3.32812, 3.34375, 3.35938, 3.375, 3.39062, 3.40625, 3.42188, 3.4375, 3.45312, 3.46875, 3.48438], + // CHECK: [3.5, 3.51562, 3.53125, 3.54688, 3.5625, 3.57812, 3.59375, 3.60938, 3.625, 3.64062, 3.65625, 3.67188, 3.6875, 3.70312, 3.71875, 3.73438], + // CHECK: [3.75, 3.76562, 3.78125, 3.79688, 3.8125, 3.82812, 3.84375, 3.85938, 3.875, 3.89062, 3.90625, 3.92188, 3.9375, 3.95312, 3.96875, 3.98438] + + call @print_rhs_as_memref_32(%rhs) : (!rhs_memref_type) -> () + // CHECK: [0, 0.015625, 0.03125, 0.046875, 0.0625, 0.078125, 0.09375, 0.109375], + // CHECK: [0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375], + // CHECK: [0.25, 0.265625, 0.28125, 0.296875, 0.3125, 0.328125, 0.34375, 0.359375], + // CHECK: [0.375, 0.390625, 0.40625, 0.421875, 0.4375, 0.453125, 0.46875, 0.484375], + // CHECK: [0.5, 0.515625, 0.53125, 0.546875, 0.5625, 0.578125, 0.59375, 0.609375], + // CHECK: [0.625, 0.640625, 0.65625, 0.671875, 0.6875, 0.703125, 0.71875, 0.734375], + // CHECK: [0.75, 0.765625, 0.78125, 0.796875, 0.8125, 0.828125, 0.84375, 0.859375], + // CHECK: [0.875, 0.890625, 0.90625, 0.921875, 0.9375, 0.953125, 0.96875, 0.984375], + // CHECK: [1, 1.01562, 1.03125, 1.04688, 1.0625, 1.07812, 1.09375, 1.10938], + // CHECK: [1.125, 1.14062, 1.15625, 1.17188, 1.1875, 1.20312, 1.21875, 1.23438], + // CHECK: [1.25, 1.26562, 1.28125, 1.29688, 1.3125, 1.32812, 1.34375, 1.35938], + // CHECK: [1.375, 1.39062, 1.40625, 1.42188, 1.4375, 1.45312, 1.46875, 1.48438], + // CHECK: [1.5, 1.51562, 1.53125, 1.54688, 1.5625, 1.57812, 1.59375, 1.60938], + // CHECK: [1.625, 1.64062, 1.65625, 1.67188, 1.6875, 1.70312, 1.71875, 1.73438], + // CHECK: [1.75, 1.76562, 1.78125, 1.79688, 1.8125, 1.82812, 1.84375, 1.85938], + // CHECK: [1.875, 1.89062, 1.90625, 1.92188, 1.9375, 1.95312, 1.96875, 1.98438] + + call @print_res_as_memref_32(%res) : (!res_memref_type) -> () + // CHECK: [0, 0.015625, 
0.03125, 0.046875, 0.0625, 0.078125, 0.09375, 0.109375], + // CHECK: [0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375], + // CHECK: [0.25, 0.265625, 0.28125, 0.296875, 0.3125, 0.328125, 0.34375, 0.359375], + // CHECK: [0.375, 0.390625, 0.40625, 0.421875, 0.4375, 0.453125, 0.46875, 0.484375], + // CHECK: [0.5, 0.515625, 0.53125, 0.546875, 0.5625, 0.578125, 0.59375, 0.609375], + // CHECK: [0.625, 0.640625, 0.65625, 0.671875, 0.6875, 0.703125, 0.71875, 0.734375], + // CHECK: [0.75, 0.765625, 0.78125, 0.796875, 0.8125, 0.828125, 0.84375, 0.859375], + // CHECK: [0.875, 0.890625, 0.90625, 0.921875, 0.9375, 0.953125, 0.96875, 0.984375], + // CHECK: [1, 1.01562, 1.03125, 1.04688, 1.0625, 1.07812, 1.09375, 1.10938], + // CHECK: [1.125, 1.14062, 1.15625, 1.17188, 1.1875, 1.20312, 1.21875, 1.23438], + // CHECK: [1.25, 1.26562, 1.28125, 1.29688, 1.3125, 1.32812, 1.34375, 1.35938], + // CHECK: [1.375, 1.39062, 1.40625, 1.42188, 1.4375, 1.45312, 1.46875, 1.48438], + // CHECK: [1.5, 1.51562, 1.53125, 1.54688, 1.5625, 1.57812, 1.59375, 1.60938], + // CHECK: [1.625, 1.64062, 1.65625, 1.67188, 1.6875, 1.70312, 1.71875, 1.73438], + // CHECK: [1.75, 1.76562, 1.78125, 1.79688, 1.8125, 1.82812, 1.84375, 1.85938], + // CHECK: [1.875, 1.89062, 1.90625, 1.92188, 1.9375, 1.95312, 1.96875, 1.98438] + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c1, %block_z = %c1) { + + linalg.matmul ins(%lhs, %rhs: !lhs_memref_type, !rhs_memref_type) + outs(%res: !res_memref_type) + + gpu.terminator + } + + + // Print the result memref after computation. + // This has been verified against other f16 CUDA implementations. 
+ call @print_res_as_memref_32(%res) : (!res_memref_type) -> () + // CHECK: [2.42188, 2.4668, 2.51172, 2.55664, 2.60156, 2.64648, 2.69141, 2.73633], + // CHECK: [6.29688, 6.40625, 6.51172, 6.61719, 6.72656, 6.83594, 6.94141, 7.04688], + // CHECK: [10.1719, 10.3438, 10.5156, 10.6797, 10.8516, 11.0234, 11.1875, 11.3594], + // CHECK: [14.0469, 14.2812, 14.5156, 14.7422, 14.9766, 15.2109, 15.4375, 15.6719], + // CHECK: [17.9219, 18.2188, 18.5156, 18.8125, 19.0938, 19.3906, 19.6875, 19.9844], + // CHECK: [21.7969, 22.1562, 22.5156, 22.875, 23.2188, 23.5781, 23.9375, 24.2969], + // CHECK: [25.6719, 26.0938, 26.5156, 26.9375, 27.3438, 27.7656, 28.1875, 28.6094], + // CHECK: [29.5469, 30.0312, 30.5156, 31, 31.4688, 31.9531, 32.4375, 32.9375], + // CHECK: [33.4375, 33.9688, 34.5, 35.0625, 35.5938, 36.1562, 36.6875, 37.25], + // CHECK: [37.3125, 37.9062, 38.5, 39.125, 39.7188, 40.3438, 40.9375, 41.5625], + // CHECK: [41.1875, 41.8438, 42.5, 43.1875, 43.8438, 44.5312, 45.1875, 45.875], + // CHECK: [45.0625, 45.7812, 46.5, 47.25, 47.9688, 48.7188, 49.4375, 50.1875], + // CHECK: [48.9375, 49.7188, 50.5, 51.3125, 52.0938, 52.9062, 53.6875, 54.5], + // CHECK: [52.8125, 53.6562, 54.5, 55.375, 56.2188, 57.0938, 57.9375, 58.8125], + // CHECK: [56.6875, 57.5938, 58.5, 59.4375, 60.3438, 61.2812, 62.1875, 63.125], + // CHECK: [60.5625, 61.5312, 62.5, 63.5, 64.5, 65.4375, 66.4375, 67.4375] + + return +} + +func.func private @printMemrefF32(memref<*xf32>) + +transform.sequence failures(propagate) { +^bb1(%arg1: !transform.any_op): + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.nvgpu.rewrite_matmul_as_mma_sync %matmul + : (!transform.any_op) -> () +} diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir new file mode 100644 --- /dev/null +++ 
b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir @@ -0,0 +1,178 @@ +// RUN: mlir-opt %s \ +// RUN: -test-transform-dialect-interpreter \ +// RUN: | FileCheck %s --check-prefix=CHECK-MMA-SYNC + +// CHECK-MMA-SYNC-LABEL: func @main() { +// CHECK-MMA-SYNC: nvgpu.mma.sync(%{{.*}}) {mmaShape = [16, 8, 4], tf32Enabled} +// CHECK-MMA-SYNC-SAME: : (vector<2x1xf32>, vector<1x1xf32>, vector<2x2xf32>) -> vector<2x2xf32> + +// Tested to run locally in 1.7s. + +// RUN: mlir-opt %s \ +// RUN: -test-transform-dialect-interpreter \ +// RUN: -test-transform-dialect-erase-schedule \ +// RUN: -gpu-kernel-outlining \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-vector-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -expand-strided-metadata \ +// RUN: -lower-affine \ +// RUN: -convert-index-to-llvm=index-bitwidth=32 \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-func-to-llvm \ +// RUN: -canonicalize \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm{use-opaque-pointers=1},lower-affine,convert-scf-to-cf,convert-vector-to-llvm,convert-math-to-llvm,expand-strided-metadata,lower-affine,convert-index-to-llvm{index-bitwidth=32},convert-arith-to-llvm,reconcile-unrealized-casts,gpu-to-cubin{chip=sm_80 features=+ptx76}))' \ +// RUN: | mlir-opt -convert-index-to-llvm=index-bitwidth=32 \ +// RUN: -gpu-to-llvm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner \ +// RUN: --shared-libs=%mlir_cuda_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +!lhs_memref_type = memref<16x4xf32> +!rhs_memref_type = memref<4x8xf32> +!res_memref_type = memref<16x8xf32> + +func.func @compute_linspace_val(%ridx: index, %cidx: index, %strideCidx: index) -> f32 { + %r = arith.index_cast %ridx : index to i32 + %c = arith.index_cast %cidx : index to 
i32 + %strideC = arith.index_cast %strideCidx : index to i32 + %2 = arith.muli %r, %strideC : i32 + %3 = arith.addi %c, %2 : i32 + %4 = arith.sitofp %3 : i32 to f32 + return %4: f32 +} + +func.func @main() { + %lhs = memref.alloc() : !lhs_memref_type + %rhs = memref.alloc() : !rhs_memref_type + %res = memref.alloc() : !res_memref_type + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %M = memref.dim %res, %c0 : !res_memref_type + %N = memref.dim %res, %c1 : !res_memref_type + %K = memref.dim %lhs, %c1 : !lhs_memref_type + + %f1 = arith.constant 1.0e+00 : f32 + %f0 = arith.constant 0.0e+00 : f32 + %c32 = arith.constant 32 : index + + // Initialize the lhs matrix with a linspace function. + scf.for %r = %c0 to %M step %c1 { + scf.for %c = %c0 to %K step %c1 { + %idx = func.call @compute_linspace_val(%r, %c, %K) : (index, index, index) -> f32 + memref.store %idx, %lhs[%r, %c] : !lhs_memref_type + } + } + // Initialize the rhs matrix with a linspace function. + scf.for %r = %c0 to %K step %c1 { + scf.for %c = %c0 to %N step %c1 { + %idx = func.call @compute_linspace_val(%r, %c, %N) : (index, index, index) -> f32 + memref.store %idx, %rhs[%r, %c] : !rhs_memref_type + } + } + // Initialize the res matrix with a linspace function. + scf.for %r = %c0 to %M step %c1 { + scf.for %c = %c0 to %N step %c1 { + %idx = func.call @compute_linspace_val(%r, %c, %N) : (index, index, index) -> f32 + memref.store %idx, %res[%r, %c] : !res_memref_type + } + } + + %ulhs = memref.cast %lhs : !lhs_memref_type to memref<*xf32> + %urhs = memref.cast %rhs : !rhs_memref_type to memref<*xf32> + %ures = memref.cast %res : !res_memref_type to memref<*xf32> + gpu.host_register %ulhs : memref<*xf32> + gpu.host_register %urhs : memref<*xf32> + gpu.host_register %ures : memref<*xf32> + + // Print the memrefs before computation. 
+ call @printMemrefF32(%ulhs) : (memref<*xf32>) -> () + // CHECK: [0, 1, 2, 3], + // CHECK: [4, 5, 6, 7], + // CHECK: [8, 9, 10, 11], + // CHECK: [12, 13, 14, 15], + // CHECK: [16, 17, 18, 19], + // CHECK: [20, 21, 22, 23], + // CHECK: [24, 25, 26, 27], + // CHECK: [28, 29, 30, 31], + // CHECK: [32, 33, 34, 35], + // CHECK: [36, 37, 38, 39], + // CHECK: [40, 41, 42, 43], + // CHECK: [44, 45, 46, 47], + // CHECK: [48, 49, 50, 51], + // CHECK: [52, 53, 54, 55], + // CHECK: [56, 57, 58, 59], + // CHECK: [60, 61, 62, 63] + + call @printMemrefF32(%urhs) : (memref<*xf32>) -> () + // CHECK: [0, 1, 2, 3, 4, 5, 6, 7], + // CHECK: [8, 9, 10, 11, 12, 13, 14, 15], + // CHECK: [16, 17, 18, 19, 20, 21, 22, 23], + // CHECK: [24, 25, 26, 27, 28, 29, 30, 31] + + call @printMemrefF32(%ures) : (memref<*xf32>) -> () + // CHECK: [0, 1, 2, 3, 4, 5, 6, 7], + // CHECK: [8, 9, 10, 11, 12, 13, 14, 15], + // CHECK: [16, 17, 18, 19, 20, 21, 22, 23], + // CHECK: [24, 25, 26, 27, 28, 29, 30, 31], + // CHECK: [32, 33, 34, 35, 36, 37, 38, 39], + // CHECK: [40, 41, 42, 43, 44, 45, 46, 47], + // CHECK: [48, 49, 50, 51, 52, 53, 54, 55], + // CHECK: [56, 57, 58, 59, 60, 61, 62, 63], + // CHECK: [64, 65, 66, 67, 68, 69, 70, 71], + // CHECK: [72, 73, 74, 75, 76, 77, 78, 79], + // CHECK: [80, 81, 82, 83, 84, 85, 86, 87], + // CHECK: [88, 89, 90, 91, 92, 93, 94, 95], + // CHECK: [96, 97, 98, 99, 100, 101, 102, 103], + // CHECK: [104, 105, 106, 107, 108, 109, 110, 111], + // CHECK: [112, 113, 114, 115, 116, 117, 118, 119], + // CHECK: [120, 121, 122, 123, 124, 125, 126, 127] + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c1, %block_z = %c1) { + + linalg.matmul ins(%lhs, %rhs: !lhs_memref_type, !rhs_memref_type) + outs(%res: !res_memref_type) + + gpu.terminator + } + + + // Print the result memref after computation. 
+ call @printMemrefF32(%ures) : (memref<*xf32>) -> () + + // CHECK: [112, 119, 126, 133, 140, 147, 154, 161], + // CHECK: [312, 335, 358, 381, 404, 427, 450, 473], + // CHECK: [512, 551, 590, 629, 668, 707, 746, 785], + // CHECK: [712, 767, 822, 877, 932, 987, 1042, 1097], + // CHECK: [912, 983, 1054, 1125, 1196, 1267, 1338, 1409], + // CHECK: [1112, 1199, 1286, 1373, 1460, 1547, 1634, 1721], + // CHECK: [1312, 1415, 1518, 1621, 1724, 1827, 1930, 2033], + // CHECK: [1512, 1631, 1750, 1869, 1988, 2107, 2226, 2345], + // CHECK: [1712, 1847, 1982, 2117, 2252, 2387, 2522, 2657], + // CHECK: [1912, 2063, 2214, 2365, 2516, 2667, 2818, 2969], + // CHECK: [2112, 2279, 2446, 2613, 2780, 2947, 3114, 3281], + // CHECK: [2312, 2495, 2678, 2861, 3044, 3227, 3410, 3593], + // CHECK: [2512, 2711, 2910, 3109, 3308, 3507, 3706, 3905], + // CHECK: [2712, 2927, 3142, 3357, 3572, 3787, 4002, 4217], + // CHECK: [2912, 3143, 3374, 3605, 3836, 4067, 4298, 4529], + // CHECK: [3112, 3359, 3606, 3853, 4100, 4347, 4594, 4841] + + return +} + +func.func private @printMemrefF32(memref<*xf32>) + +transform.sequence failures(propagate) { +^bb1(%arg1: !transform.any_op): + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.nvgpu.rewrite_matmul_as_mma_sync %matmul + : (!transform.any_op) -> () +} diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2758,6 +2758,64 @@ ], ) +cc_library( + name = "NVGPUTransformOps", + srcs = glob([ + "lib/Dialect/NVGPU/TransformOps/*.cpp", + ]), + hdrs = glob([ + "include/mlir/Dialect/NVGPU/TransformOps/*.h", + ]), + includes = ["include"], + deps = [ + ":ArithDialect", + ":ArithUtils", + ":AffineDialect", + ":DialectUtils", + ":GPUDialect", + ":IR", + ":LinalgDialect", + ":MemRefDialect", + ":NVGPUDialect", + 
":NVGPUTransformOpsIncGen", + ":Support", + ":TransformDialect", + ":VectorDialect", + "//llvm:Support", + ], +) + +td_library( + name = "NVGPUTransformOpsTdFiles", + srcs = glob([ + "include/mlir/Dialect/NVGPU/TransformOps/*.td", + ]), + includes = ["include"], + deps = [ + ":TransformDialectTdFiles", + ], +) + +gentbl_cc_library( + name = "NVGPUTransformOpsIncGen", + strip_include_prefix = "include", + tbl_outs = [ + ( + ["-gen-op-decls"], + "include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h.inc", + ), + ( + ["-gen-op-defs"], + "include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td", + deps = [ + ":NVGPUTransformOpsTdFiles", + ], +) + cc_library( name = "NVGPUUtils", srcs = ["lib/Dialect/NVGPU/Utils/MMAUtils.cpp"], @@ -7685,6 +7743,7 @@ ":NVGPUPassIncGen", ":NVGPUToNVVM", ":NVGPUTransforms", + ":NVGPUTransformOps", ":NVVMDialect", ":OpenACCDialect", ":OpenMPDialect",