diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
--- a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
@@ -1,2 +1,17 @@
 add_mlir_dialect(NVGPU nvgpu)
 add_mlir_doc(NVGPU NVGPU Dialects/ -gen-dialect-doc)
+
+set(LLVM_TARGET_DEFINITIONS NVGPU.td)
+mlir_tablegen(NVGPUEnums.h.inc -gen-enum-decls)
+mlir_tablegen(NVGPUEnums.cpp.inc -gen-enum-defs)
+add_public_tablegen_target(MLIRNVGPUEnumsIncGen)
+
+set(LLVM_TARGET_DEFINITIONS NVGPU.td)
+mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls)
+mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(MLIRNVGPUAttributesIncGen)
+
+set(LLVM_TARGET_DEFINITIONS NVGPU.td)
+mlir_tablegen(NVGPUAttrTypes.h.inc -gen-typedef-decls)
+mlir_tablegen(NVGPUAttrTypes.cpp.inc -gen-typedef-defs)
+add_public_tablegen_target(MLIRNVGPUTypesIncGen)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -23,6 +23,7 @@
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/OpBase.td"
+include "mlir/IR/EnumAttr.td"
 
 def NVGPU_Dialect : Dialect {
   let name = "nvgpu";
@@ -61,6 +62,58 @@
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// NVGPU Attribute Definitions
+//===----------------------------------------------------------------------===//
+
+def TensorMapSwizzleNone : I32EnumAttrCase<"SWIZZLE_NONE", 0, "none">;
+def TensorMapSwizzle32B : I32EnumAttrCase<"SWIZZLE_32B", 1, "swizzle_32b">;
+def TensorMapSwizzle64B : I32EnumAttrCase<"SWIZZLE_64B", 2, "swizzle_64b">;
+def TensorMapSwizzle128B : I32EnumAttrCase<"SWIZZLE_128B", 3, "swizzle_128b">;
+def TensorMapSwizzleKind : I32EnumAttr<"TensorMapSwizzleKind",
+    "Tensor map swizzling mode of shared memory banks",
+    [ TensorMapSwizzleNone, TensorMapSwizzle32B, TensorMapSwizzle64B,
+      TensorMapSwizzle128B]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::nvgpu";
+}
+
+def TensorMapL2PromoNone : I32EnumAttrCase<"L2PROMO_NONE", 0, "none">;
+def TensorMapL2Promo64B : I32EnumAttrCase<"L2PROMO_64B", 1, "l2promo_64b">;
+def TensorMapL2Promo128B : I32EnumAttrCase<"L2PROMO_128B", 2, "l2promo_128b">;
+def TensorMapL2Promo256B : I32EnumAttrCase<"L2PROMO_256B", 3, "l2promo_256b">;
+def TensorMapL2PromoKind : I32EnumAttr<"TensorMapL2PromoKind",
+    "Tensor map L2 promotion type",
+    [ TensorMapL2PromoNone, TensorMapL2Promo64B, TensorMapL2Promo128B,
+      TensorMapL2Promo256B]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::nvgpu";
+}
+
+def TensorMapOOBZero : I32EnumAttrCase<"OOB_ZERO", 0, "zero">;
+def TensorMapOOBNaN : I32EnumAttrCase<"OOB_NAN", 1, "nan">;
+def TensorMapOOBKind : I32EnumAttr<"TensorMapOOBKind",
+    "Tensor map out-of-bounds fill type",
+    [ TensorMapOOBZero, TensorMapOOBNaN]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::nvgpu";
+}
+
+def TensorMapInterleaveNone : I32EnumAttrCase<"INTERLEAVE_NONE", 0, "none">;
+def TensorMapInterleave16B : I32EnumAttrCase<"INTERLEAVE_16B", 1, "interleave_16b">;
+def TensorMapInterleave32B : I32EnumAttrCase<"INTERLEAVE_32B", 2, "interleave_32b">;
+def TensorMapInterleaveKind : I32EnumAttr<"TensorMapInterleaveKind",
+    "Tensor map interleave layout type",
+    [ TensorMapInterleaveNone, TensorMapInterleave16B, TensorMapInterleave32B]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::nvgpu";
+}
+
+def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
+def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
+def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
+def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
+
 //===----------------------------------------------------------------------===//
 // NVGPU Type Definitions
 //===----------------------------------------------------------------------===//
@@ -100,6 +153,21 @@
 def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
 
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
+def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
+  let summary = "TensorMap descriptor";
+  let parameters = (ins "MemRefType":$tensor,
+                        EnumParameter<TensorMapSwizzleKind>:$swizzle,
+                        EnumParameter<TensorMapL2PromoKind>:$l2promo,
+                        EnumParameter<TensorMapOOBKind>:$oob,
+                        EnumParameter<TensorMapInterleaveKind>:$interleave);
+  let description = [{
+    `nvgpu.tensormap.descriptor` is a type that represents a TMA descriptor:
+    a 128-byte object that resides either in constant space or in a kernel
+    parameter.
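+
+    For example, a descriptor for a 1-d tile of 128 f32 elements (mirroring
+    the 1-d case in the conversion test below) is written as:
+
+    ```mlir
+    !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none,
+                                l2promo = none, oob = zero, interleave = none>
+    ```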
+  }];
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
 //===----------------------------------------------------------------------===//
 // NVGPU Op Definitions
 //===----------------------------------------------------------------------===//
@@ -509,4 +577,27 @@
   let assemblyFormat = "$barrier `,` $phase `,` $ticks attr-dict `:` type($barrier)";
 }
 
+def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> {
+  let summary = "TMA asynchronous load";
+  let description = [{
+    The op loads a tile of a memory region from global memory into shared
+    memory using Tensor Memory Access (TMA).
+
+    `$tensorMapDescriptor` is a tensor map descriptor that carries the
+    information about the tile shape; the descriptor is created by
+    `nvgpu.tma.create.descriptor`.
+
+    The op uses the mbarrier-based completion mechanism through `$barrier`.
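+
+    Example (the 1-d case, as exercised by the conversion test below):
+
+    ```mlir
+    nvgpu.tma.async.load %tensorMap[%crd0], %mbarrier to %dst
+        : !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none,
+                                      l2promo = none, oob = zero, interleave = none>,
+          !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+          -> memref<128xf32, 3>
+    ```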
+  }];
+  let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
+                       NVGPU_MBarrier:$barrier,
+                       NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
+                       Variadic<Index>:$coordinates);
+  let assemblyFormat = [{
+    $tensorMapDescriptor `[` $coordinates `]` `,` $barrier `to` $dst
+      attr-dict `:` type($tensorMapDescriptor) `,` type($barrier) `->` type($dst)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // NVGPU
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -19,6 +19,11 @@
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
+
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.h.inc"
+
 #define GET_TYPEDEF_CLASSES
 #include "mlir/Dialect/NVGPU/IR/NVGPUTypes.h.inc"
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -413,6 +413,9 @@
     converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
       return converter.convertType(createMBarrierMemrefType(rewriter, type));
     });
+    // Lower the tensor map descriptor to an LLVM pointer to the element type.
+    converter.addConversion([&](nvgpu::TensorMapDescriptorType type) -> Type {
+      return converter.getPointerType(type.getTensor().getElementType());
+    });
     populateNVGPUToNVVMConversionPatterns(converter, patterns);
     LLVMConversionTarget target(getContext());
     target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
@@ -770,11 +773,7 @@
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
 
-    Value count = adaptor.getCount();
-    if (!adaptor.getCount().getType().isInteger(32)) {
-      count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
-                                             rewriter.getI32Type(), count);
-    }
+    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
 
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxSharedOp>(op, barrier,
@@ -822,11 +821,7 @@
                                    op.getBarrier(), adaptor.getBarrier());
     Type tokenType = getTypeConverter()->convertType(
         nvgpu::MBarrierTokenType::get(op->getContext()));
-    Value count = adaptor.getCount();
-    if (!adaptor.getCount().getType().isInteger(32)) {
-      count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
-                                             rewriter.getI32Type(), count);
-    }
+    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
           op, tokenType, barrier, count);
@@ -910,6 +905,27 @@
   }
 };
 
+struct NVGPUTmaAsyncLoadOpLowering
+    : public ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp> {
+  using ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp>::ConvertOpToLLVMPattern;
+  LogicalResult
+  matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Extract the aligned base pointer of the shared memory destination from
+    // the lowered memref descriptor (field 1).
+    auto dest = rewriter.create<LLVM::ExtractValueOp>(op->getLoc(),
+                                                      adaptor.getDst(), 1);
+    Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+                                   op.getBarrier(), adaptor.getBarrier());
+
+    // TMA coordinates are 32-bit; truncate the index-typed coordinates.
+    SmallVector<Value> coords = adaptor.getCoordinates();
+    for (auto [index, value] : llvm::enumerate(coords)) {
+      coords[index] = truncToI32(rewriter, op->getLoc(), value);
+    }
+
+    rewriter.replaceOpWithNewOp<NVVM::CpAsyncBulkTensorGlobalToSharedClusterOp>(
+        op, dest, adaptor.getTensorMapDescriptor(), barrier, coords);
+    return success();
+  }
+};
 } // namespace
 
 void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -922,6 +938,7 @@
       NVGPUMBarrierTestWaitLowering,       // nvgpu.mbarrier.test_wait_parity
      NVGPUMBarrierTryWaitParityLowering,  // nvgpu.mbarrier.try_wait_parity
       NVGPUMBarrierArriveExpectTxLowering, // nvgpu.mbarrier.arrive.expect_tx
+      NVGPUTmaAsyncLoadOpLowering,         // nvgpu.tma.async.load
       MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
       NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
       NVGPUMmaSparseSyncLowering>(converter);
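To make the new `NVGPUTmaAsyncLoadOpLowering` pattern concrete, here is a minimal before/after sketch of the 1-d case (operand names are illustrative and types are elided; the exact output shape is pinned down by the CHECK lines in the test update below):

```mlir
// Input (nvgpu):
nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier to %buffer1d
    : !tensorMap1d, !mbarrier -> memref<128xf32, 3>

// Output (nvvm), schematically: the destination becomes the aligned shared
// memory pointer, the mbarrier a pointer, and the index coordinate an i32.
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tensorMap, %barrier, box[%crd0]
```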
diff --git a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
--- a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
@@ -6,6 +6,9 @@
 
   DEPENDS
   MLIRNVGPUIncGen
+  MLIRNVGPUEnumsIncGen
+  MLIRNVGPUAttributesIncGen
+  MLIRNVGPUTypesIncGen
 
   LINK_LIBS PUBLIC
   MLIRGPUDialect
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -14,21 +14,31 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Verifier.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
 using namespace mlir::nvgpu;
 
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"
+
 void nvgpu::NVGPUDialect::initialize() {
   addTypes<
 #define GET_TYPEDEF_LIST
 #include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
       >();
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.cpp.inc"
+      >();
   addOperations<
 #define GET_OP_LIST
 #include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
@@ -320,11 +330,39 @@
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// NVGPU_TmaAsyncLoadOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult TmaAsyncLoadOp::verify() {
+  // The destination memref must live in shared memory.
+  auto dstMemref = llvm::cast<MemRefType>(getDst().getType());
+  if (!NVGPUDialect::hasSharedMemoryAddressSpace(dstMemref)) {
+    return emitError()
+           << "The operation stores data to shared memory, but "
+              "the destination memref does not have a memory space of "
+           << NVGPUDialect::kSharedMemoryAddressSpace;
+  }
+  // TMA supports tiles of at most rank 5.
+  if (getCoordinates().size() > 5) {
+    return emitError() << "Maximum 5 coordinates are supported.";
+  }
+  if (getCoordinates().size() != size_t(dstMemref.getRank())) {
+    return emitError() << "Destination memref rank is "
+                       << size_t(dstMemref.getRank()) << " but there are "
+                       << getCoordinates().size()
+                       << " coordinates. They must match.";
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd dialect, type, and op definitions
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.cpp.inc"
+
+#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.cpp.inc"
 
 #define GET_OP_CLASSES
 #include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
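For illustration, IR that the new verifier rejects might look like this: a rank-2 destination fed a single coordinate (a hypothetical snippet, not part of the patch's test suite):

```mlir
// expected-error: Destination memref rank is 2 but there are 1 coordinates. They must match.
nvgpu.tma.async.load %tensorMap2d[%crd0], %mbarrier to %buffer2d
    : !tensorMap2d, !mbarrier -> memref<32x32xf32, 3>
```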
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -559,7 +559,6 @@
   func.return
 }
 
-
 // -----
 !barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
 !tokenType = !nvgpu.mbarrier.token
@@ -603,4 +602,36 @@
   nvgpu.mbarrier.try_wait.parity %barrier, %phase, %ticks : !barrierType
   func.return
-}
\ No newline at end of file
+}
+
+// -----
+
+// CHECK-LABEL: func @async_tma_load
+!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>, swizzle=none, l2promo = none, oob = nan, interleave = interleave_16b>
+!tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
+!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
+!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b, l2promo = l2promo_128b, oob = zero, interleave = none>
+!tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none, l2promo = none, oob = zero, interleave = none>
+!mbarrier = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
+                          %buffer1d: memref<128xf32,3>,
+                          %buffer2d: memref<32x32xf32,3>,
+                          %buffer3d: memref<2x32x32xf32,3>,
+                          %buffer4d: memref<2x2x32x32xf32,3>,
+                          %buffer5d: memref<2x2x2x32x32xf32,3>,
+                          %mbarrier: !mbarrier) {
+  %crd0 = arith.constant 0 : index
+  %crd1 = arith.constant 0 : index
+  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}]
+  nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3>
+  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}]
+  nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3>
+  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}]
+  nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3>
+  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
+  nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3>
+  // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
+  nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3>
+  func.return
+}