diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -9,6 +9,9 @@
 #define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
 
 #include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Types.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/StringRef.h"
 #include <functional>
@@ -47,6 +50,18 @@
 using LoweringCallback = std::function<std::unique_ptr<llvm::Module>(
     Operation *, llvm::LLVMContext &, StringRef)>;
 
+struct FunctionCallBuilder {
+  FunctionCallBuilder(StringRef functionName, Type returnType,
+                      ArrayRef<Type> argumentTypes)
+      : functionName(functionName),
+        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
+  LLVM::CallOp create(Location loc, OpBuilder &builder,
+                      ArrayRef<Value> arguments) const;
+
+  StringRef functionName;
+  LLVM::LLVMFunctionType functionType;
+};
+
 /// Collect a set of patterns to convert from the GPU dialect to LLVM and
 /// populate converter for gpu types.
 void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -74,6 +74,7 @@
   "Tensor map swizzling mode of shared memory banks", [
     TensorMapSwizzleNone, TensorMapSwizzle32B, TensorMapSwizzle64B,
     TensorMapSwizzle128B]> {
+  let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::nvgpu";
 }
 
@@ -85,6 +86,7 @@
   "Tensor map L2 promotion type", [
     TensorMapL2PromoNone, TensorMapL2Promo64B, TensorMapL2Promo128B,
     TensorMapL2Promo256B]> {
+  let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::nvgpu";
 }
 
@@ -93,6 +95,7 @@
 def TensorMapOOBKind : I32BitEnumAttr<"TensorMapOOBKind",
   "Tensor map out-of-bounds fill type", [
     TensorMapOOBZero, TensorMapOOBNaN]> {
+  let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::nvgpu";
 }
 
@@ -102,9 +105,15 @@
 def TensorMapInterleaveKind : I32BitEnumAttr<"TensorMapInterleaveKind",
   "Tensor map interleave layout type", [
     TensorMapInterleaveNone, TensorMapInterleave16B, TensorMapInterleave32B]> {
+  let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::nvgpu";
 }
 
+def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
+def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
+def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
+def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
+
 //===----------------------------------------------------------------------===//
 // NVGPU Type Definitions
 //===----------------------------------------------------------------------===//
@@ -550,4 +559,28 @@
   let hasVerifier = 1;
 }
 
+def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
+  let summary = "TMA create descriptor";
+  let description = [{
+    The Op creates a tensor map descriptor object representing a tiled memory
+    region. To do so, it calls CUDA Driver's `cuTensorMapEncodeTiled`. The
+    descriptor is used by Tensor Memory Access (TMA).
+
+    The `tensor` is the source tensor to be tiled.
+
+    The `boxDimensions` specifies the size of the tiled memory region in each dimension.
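+
+    Example (illustrative values; any box sizes and descriptor settings
+    accepted by the verifier can be used):
+
+    ```mlir
+    %d0 = arith.constant 64 : index
+    %d1 = arith.constant 128 : index
+    %src = memref.cast %ranked : memref<64x128xf32> to memref<*xf32>
+    %descriptor = nvgpu.tma.create.descriptor %src box[%d0, %d1]
+        : memref<*xf32> -> !nvgpu.tensormap.descriptor<tensor = memref<64x128xf32,3>,
+            swizzle = none, l2promo = none, oob = zero, interleave = none>
+    ```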
+
+    For more information, see:
+    https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
+  }];
+
+  let arguments = (ins AnyUnrankedMemRef:$tensor,
+                       Variadic<Index>:$boxDimensions);
+  let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
+  let assemblyFormat = [{
+    $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // NVGPU
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -58,18 +58,6 @@
   void runOnOperation() override;
 };
 
-struct FunctionCallBuilder {
-  FunctionCallBuilder(StringRef functionName, Type returnType,
-                      ArrayRef<Type> argumentTypes)
-      : functionName(functionName),
-        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
-  LLVM::CallOp create(Location loc, OpBuilder &builder,
-                      ArrayRef<Value> arguments) const;
-
-  StringRef functionName;
-  LLVM::LLVMFunctionType functionType;
-};
-
 template <typename OpTy>
 class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
 public:
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -8,15 +8,19 @@
 
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
 
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTNVGPUTONVVMPASS
@@ -877,6 +881,117 @@
     return success();
   }
 };
+
+/// Returns a Value that holds the data type enum expected by the CUDA driver.
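+/// For example, `f16` maps to CU_TENSOR_MAP_DATA_TYPE_FLOAT16 and `bf16` to
+/// CU_TENSOR_MAP_DATA_TYPE_BFLOAT16; element types without a matching
+/// CUtensorMapDataType entry hit the llvm_unreachable below.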
+static Value getTensorMapDataType(RewriterBase &rewriter, Operation *op,
+                                  Type type) {
+  // Enum is from CUDA driver API
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
+  enum CUtensorMapDataTypeEnum {
+    CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,
+    CU_TENSOR_MAP_DATA_TYPE_UINT16,
+    CU_TENSOR_MAP_DATA_TYPE_UINT32,
+    CU_TENSOR_MAP_DATA_TYPE_INT32,
+    CU_TENSOR_MAP_DATA_TYPE_UINT64,
+    CU_TENSOR_MAP_DATA_TYPE_INT64,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT16,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT64,
+    CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
+  };
+  auto makeConst = [&](int32_t index) -> Value {
+    return rewriter.create<LLVM::ConstantOp>(
+        op->getLoc(), IntegerType::get(op->getContext(), 64),
+        rewriter.getI32IntegerAttr(index));
+  };
+  if (type.isUnsignedInteger(8))
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_UINT8);
+  if (type.isUnsignedInteger(16))
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_UINT16);
+  if (type.isUnsignedInteger(32))
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_UINT32);
+  if (type.isUnsignedInteger(64))
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_UINT64);
+  if (type.isSignlessInteger(32))
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_INT32);
+  if (type.isSignlessInteger(64))
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_INT64);
+  if (type.isF16())
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_FLOAT16);
+  if (type.isF32())
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_FLOAT32);
+  if (type.isF64())
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_FLOAT64);
+  if (type.isBF16())
+    return makeConst(CU_TENSOR_MAP_DATA_TYPE_BFLOAT16);
+
+  llvm_unreachable("Not supported data type");
+}
+
+struct TmaCreateDescriptorOpLowering
+    : public ConvertOpToLLVMPattern<nvgpu::TmaCreateDescriptorOp> {
+  using ConvertOpToLLVMPattern<
+      nvgpu::TmaCreateDescriptorOp>::ConvertOpToLLVMPattern;
+  LogicalResult
+  matchAndRewrite(nvgpu::TmaCreateDescriptorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
+    LLVM::LLVMPointerType llvmPointerType = getTypeConverter()->getPointerType(
+        IntegerType::get(op->getContext(), 8));
+    Type llvmInt64Type = IntegerType::get(op->getContext(), 64);
+    auto makeConst = [&](int32_t index) -> Value {
+      return rewriter.create<LLVM::ConstantOp>(
+          loc, llvmInt64Type, rewriter.getI32IntegerAttr(index));
+    };
+    Value elementType = getTensorMapDataType(
+        rewriter, op, op.getTensor().getType().getElementType());
+    auto promotedOperands = getTypeConverter()->promoteOperands(
+        loc, op->getOperands(), adaptor.getOperands(), rewriter);
+
+    Value boxArrayPtr = rewriter.create<LLVM::AllocaOp>(
+        loc, llvmPointerType, llvmInt64Type, makeConst(5));
+    for (auto [index, value] : llvm::enumerate(adaptor.getBoxDimensions())) {
+      Value gep = rewriter.create<LLVM::GEPOp>(
+          loc, llvmPointerType, llvmPointerType, boxArrayPtr, makeConst(index));
+      rewriter.create<LLVM::StoreOp>(loc, value, gep);
+    }
+
+    nvgpu::TensorMapDescriptorType desc = op.getTensorMap().getType();
+    // Set arguments for the function call.
+    SmallVector<Value> arguments;
+    arguments.push_back(promotedOperands[0]);                  // rank
+    arguments.push_back(promotedOperands[1]);                  // descriptor
+    arguments.push_back(elementType);                          // data type
+    arguments.push_back(makeConst((int)desc.getInterleave())); // interleave
+    arguments.push_back(makeConst((int)desc.getSwizzle()));    // swizzle
+    arguments.push_back(makeConst((int)desc.getL2promo()));    // l2promo
+    arguments.push_back(makeConst((int)desc.getOob()));        // oob
+    arguments.push_back(boxArrayPtr);                          // box dimensions
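+    // Note: this argument order must match the parameter order of
+    // mgpuTensorMapEncodeTiledMemref in CudaRuntimeWrappers.cpp.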
+
+    // Set data types of the arguments
+    SmallVector<Type> argTypes = {
+        llvmInt64Type,   /* int64_t tensorRank */
+        llvmPointerType, /* ptr memref descriptor */
+        llvmInt64Type,   /* int64_t dataType */
+        llvmInt64Type,   /* int64_t interleave */
+        llvmInt64Type,   /* int64_t swizzle */
+        llvmInt64Type,   /* int64_t l2promo */
+        llvmInt64Type,   /* int64_t oob */
+        llvmPointerType  /* ptr boxDimensions */
+    };
+    FunctionCallBuilder hostRegisterCallBuilder = {
+        "mgpuTensorMapEncodeTiledMemref", llvmPointerType, argTypes};
+    Value tensorMap =
+        hostRegisterCallBuilder.create(loc, rewriter, arguments).getResult();
+
+    rewriter.replaceOp(op, tensorMap);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -888,6 +1003,7 @@
       NVGPUMBarrierArriveNoCompleteLowering, // nvgpu.mbarrier.arrive.no_complete
       NVGPUMBarrierTestWaitLowering,         // nvgpu.try_wait_parity
       TmaAsyncLoadOpLowering,                // nvgpu.tma.async.load
+      TmaCreateDescriptorOpLowering,         // nvgpu.tma.create.descriptor
      MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
      NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
      NVGPUMmaSparseSyncLowering>(converter);
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -355,6 +355,20 @@
   return success();
 }
 
+LogicalResult TmaCreateDescriptorOp::verify() {
+  if (getBoxDimensions().size() > 5) {
+    return emitError() << "Maximum 5 dimensional box is supported.";
+  }
+
+  nvgpu::TensorMapDescriptorType desc = getTensorMap().getType();
+  if (desc.getInterleave() != TensorMapInterleaveKind::INTERLEAVE_NONE)
+    return emitError() << "Interleave options are not supported yet.";
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd dialect, type, and op definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -175,6 +175,13 @@
   return reinterpret_cast<void *>(ptr);
 }
 
+extern "C" void *mgpuMemAllocManaged(uint64_t sizeBytes, unsigned int flags) {
+  ScopedContext scopedContext;
+  CUdeviceptr sharedPtr;
+  CUDA_REPORT_IF_ERROR(cuMemAllocManaged(&sharedPtr, sizeBytes, flags));
+  return reinterpret_cast<void *>(sharedPtr);
+}
+
 extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) {
   CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
 }
@@ -254,6 +261,70 @@
   defaultDevice = device;
 }
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
+    CUtensorMap *tensorMap,             // Tensor map object
+    CUtensorMapDataType tensorDataType, // Tensor data type
+    uint32_t tensorRank,                // Dimensionality of tensor
+    void *globalAddress,                // Starting address
+    uint64_t *globalDim,                // Tensor size (number of elements)
+    uint64_t *globalStrides,            // Stride size (in bytes)
+    uint32_t *boxDim,                   // Traversal box (number of elements)
+    uint32_t *elementStrides,           // Traversal stride
+    CUtensorMapInterleave interleave,   // Type of interleaved layout
+    CUtensorMapSwizzle swizzle,         // Bank swizzling pattern
+    CUtensorMapL2promotion l2Promotion, // L2 promotion size
+    CUtensorMapFloatOOBfill oobFill     // Padding zfill or NaN fill
+) {
+  ScopedContext scopedContext;
+  CUDA_REPORT_IF_ERROR(cuTensorMapEncodeTiled(
+      tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
+      globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
+      oobFill));
+}
+
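+// Adapts a ranked memref descriptor to cuTensorMapEncodeTiled: global
+// dimensions and strides are derived from the memref sizes, element strides
+// default to 1, and the resulting CUtensorMap is allocated with managed
+// memory so it is accessible from both host and device.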
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
+    int64_t tensorRank,                       // Dimensionality of tensor
+    StridedMemRefType<char, 1> *descriptor,   // Ranked memref descriptor
+    const CUtensorMapDataType tensorDataType, // Tensor data type
+    CUtensorMapInterleave interleave,         // Type of interleaved layout
+    CUtensorMapSwizzle swizzle,               // Bank swizzling pattern
+    CUtensorMapL2promotion l2Promotion,       // L2 promotion size
+    CUtensorMapFloatOOBfill oobFill,          // Padding zfill or NaN fill
+    int64_t *inputBoxDims                     // Box size in each dimension
+) {
+  // Allocate it via managed memory so that it is accessible from the GPU.
+  CUtensorMap *tensorMap =
+      (CUtensorMap *)mgpuMemAllocManaged(sizeof(CUtensorMap), 1);
+
+  auto *globalAddress = descriptor->data;
+  uint32_t *boxDim = (uint32_t *)alloca(sizeof(uint32_t) * (tensorRank));
+  uint32_t *elemStride = (uint32_t *)alloca(sizeof(uint32_t) * (tensorRank));
+  uint64_t *globalDim = (uint64_t *)alloca(sizeof(uint64_t) * (tensorRank));
+  uint64_t *globalStrides = (uint64_t *)alloca(sizeof(uint64_t) * (tensorRank));
+  uint32_t tensorRank32 = uint32_t(tensorRank);
+
+  static const int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
+                                           4, 8, 2, 4, 4, 4};
+  for (int r = 0; r < tensorRank; ++r) {
+    boxDim[r] = static_cast<uint32_t>(inputBoxDims[r]);
+    elemStride[r] = 1;
+    globalDim[r] = static_cast<uint64_t>(descriptor->sizes[tensorRank - r - 1]);
+    globalStrides[tensorRank - r - 1] = static_cast<uint64_t>(
+        elementSizeInBytes[tensorDataType] * globalDim[r]);
+  }
+
+  globalStrides[0] = 0;
+  for (int r = 1; r < tensorRank; r++)
+    globalStrides[r] = globalDim[r - 1] * elementSizeInBytes[tensorDataType];
+
+  ScopedContext scopedContext;
+  mgpuTensorMapEncodeTiled(tensorMap, tensorDataType, tensorRank32,
+                           globalAddress, globalDim, globalStrides, boxDim,
+                           elemStride, interleave, swizzle, l2Promotion,
+                           oobFill);
+  return reinterpret_cast<void *>(tensorMap);
+}
+
 #ifdef MLIR_ENABLE_CUDA_CUSPARSE
 
 ///
@@ -486,9 +557,9 @@
 
 struct cusparseLtSpMatHandleAndData {
   cusparseLtMatDescriptor_t mat;
-  // TODO: the following three are associated with the SpMM operator rather than
-  // the sparse matrix. Create workspace buffers and pass them to the SpMM
-  // execution.
+  // TODO: the following three are associated with the SpMM operator rather
+  // than the sparse matrix. Create workspace buffers and pass them to the
+  // SpMM execution.
   cusparseLtMatmulAlgSelection_t alg_sel;
   cusparseLtMatmulPlan_t plan;
   cusparseLtMatmulDescriptor_t matmul;
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -562,11 +562,11 @@
 // -----
 
 // CHECK-LABEL: func @async_tma_load
-!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>,         swizzle=none,        l2promo = none,        oob = nan,  interleave = interleave_16b>
+!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>,         swizzle=none,        l2promo = none,        oob = nan,  interleave = none>
 !tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>,       swizzle=swizzle_32b, l2promo = none,        oob = zero, interleave = none>
 !tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>,     swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
 !tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>,   swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = none>
-!tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none,        l2promo = none,        oob = zero, interleave = none>
+!tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none,        l2promo = none,        oob = zero, interleave = interleave_16b>
 !mbarrier = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
 func.func @async_tma_load_2d(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>,
@@ -588,4 +588,17 @@
   // CHECK : nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}]
   nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3>
   func.return
-}
\ No newline at end of file
+}
+
+func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : memref<128xf32>) {
+  %crd0 = arith.constant 64 : index
+  %crd1 = arith.constant 128 : index
+  %devicePtr2d_unranked = memref.cast %devicePtr2d : memref<64x128xf32> to memref<*xf32>
+  // CHECK : llvm.call @mgpuTensorMapEncodeTiledMemref
+  %tensorMap2d = nvgpu.tma.create.descriptor %devicePtr2d_unranked box[%crd0, %crd1] : memref<*xf32> -> !tensorMap2d
+
+  %devicePtr1d_unranked = memref.cast %devicePtr1d : memref<128xf32> to memref<*xf32>
+  // CHECK : llvm.call @mgpuTensorMapEncodeTiledMemref
+  %tensorMap1d = nvgpu.tma.create.descriptor %devicePtr1d_unranked box[%crd1] : memref<*xf32> -> !tensorMap1d
+  func.return
+}