diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -9,6 +9,9 @@
 #define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
 
 #include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Types.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/StringRef.h"
 #include <functional>
@@ -47,6 +50,18 @@
 using LoweringCallback = std::function<std::unique_ptr<llvm::Module>(
     Operation *, llvm::LLVMContext &, StringRef)>;
 
+/// Helper that builds calls to an external runtime function with the given
+/// name and LLVM function type, declaring the function if needed.
+struct FunctionCallBuilder {
+  FunctionCallBuilder(StringRef functionName, Type returnType,
+                      ArrayRef<Type> argumentTypes)
+      : functionName(functionName),
+        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
+  LLVM::CallOp create(Location loc, OpBuilder &builder,
+                      ArrayRef<Value> arguments) const;
+
+  StringRef functionName;
+  LLVM::LLVMFunctionType functionType;
+};
+
 /// Collect a set of patterns to convert from the GPU dialect to LLVM and
 /// populate converter for gpu types.
 void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -600,4 +600,28 @@
 }
 
+def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
+  let summary = "TMA create descriptor";
+  let description = [{
+    The op creates a tensor map descriptor object that represents a tiled
+    memory region. To do so, it calls the CUDA driver's
+    `cuTensorMapEncodeTiled`. The descriptor is used by Tensor Memory Access
+    (TMA).
+
+    `tensor` is the source tensor to be tiled.
+
+    `boxDimensions` is the size of the tiled memory region in each dimension.
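+
+    Example (illustrative only; the result type shown, including its `tensor`
+    shape and layout options, is hypothetical and must match the actual source
+    tensor and tiling):
+
+    ```mlir
+    %descriptor = nvgpu.tma.create.descriptor %tensor box[%c64, %c128]
+        : memref<*xf32> -> !nvgpu.tensormap.descriptor<
+            tensor = memref<64x128xf32, 3>, swizzle = none, l2promo = none,
+            oob = zero, interleave = none>
+    ```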
+
+    For more information, see:
+    https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
+  }];
+
+  let arguments = (ins AnyUnrankedMemRef:$tensor,
+                       Variadic<Index>:$boxDimensions);
+  let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
+  let assemblyFormat = [{
+    $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // NVGPU
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -58,18 +58,6 @@
   void runOnOperation() override;
 };
 
-struct FunctionCallBuilder {
-  FunctionCallBuilder(StringRef functionName, Type returnType,
-                      ArrayRef<Type> argumentTypes)
-      : functionName(functionName),
-        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
-  LLVM::CallOp create(Location loc, OpBuilder &builder,
-                      ArrayRef<Value> arguments) const;
-
-  StringRef functionName;
-  LLVM::LLVMFunctionType functionType;
-};
-
 template <typename OpTy>
 class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
 public:
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -8,15 +8,19 @@
 
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
 
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTNVGPUTONVVMPASS
@@ -925,6 +929,121 @@
     return success();
   }
 };
+
+static Value makeI64Const(RewriterBase &rewriter, Operation *op,
+                          int32_t index) {
+  return rewriter.create<LLVM::ConstantOp>(op->getLoc(),
+                                           rewriter.getIntegerType(64),
+                                           rewriter.getI32IntegerAttr(index));
+}
+
+/// Returns an i64 constant holding the data type enum value that the CUDA
+/// driver expects for the given element type.
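+/// For example (purely illustrative), an `f16` element type is lowered to the
+/// constant 6, i.e. CU_TENSOR_MAP_DATA_TYPE_FLOAT16 in the enum below.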
+static Value elementTypeAsLLVMConstant(RewriterBase &rewriter, Operation *op,
+                                       Type type) {
+  // Enum is from the CUDA driver API:
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
+  enum CUtensorMapDataTypeEnum {
+    CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,
+    CU_TENSOR_MAP_DATA_TYPE_UINT16,
+    CU_TENSOR_MAP_DATA_TYPE_UINT32,
+    CU_TENSOR_MAP_DATA_TYPE_INT32,
+    CU_TENSOR_MAP_DATA_TYPE_UINT64,
+    CU_TENSOR_MAP_DATA_TYPE_INT64,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT16,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT64,
+    CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
+  };
+
+  if (type.isUnsignedInteger(8))
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT8);
+  if (type.isUnsignedInteger(16))
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT16);
+  if (type.isUnsignedInteger(32))
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT32);
+  if (type.isUnsignedInteger(64))
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT64);
+  if (type.isSignlessInteger(32))
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_INT32);
+  if (type.isSignlessInteger(64))
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_INT64);
+  if (type.isF16())
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT16);
+  if (type.isF32())
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT32);
+  if (type.isF64())
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT64);
+  if (type.isBF16())
+    return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16);
+
+  llvm_unreachable("Unsupported data type");
+}
+
+struct NVGPUTmaCreateDescriptorOpLowering
+    : public ConvertOpToLLVMPattern<nvgpu::TmaCreateDescriptorOp> {
+  using ConvertOpToLLVMPattern<
+      nvgpu::TmaCreateDescriptorOp>::ConvertOpToLLVMPattern;
+  LogicalResult
+  matchAndRewrite(nvgpu::TmaCreateDescriptorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
+    LLVM::LLVMPointerType llvmPointerType = getTypeConverter()->getPointerType(
+        IntegerType::get(op->getContext(), 8));
+    Type llvmInt64Type = IntegerType::get(op->getContext(), 64);
+
+    Value tensorElementType = elementTypeAsLLVMConstant(
+        rewriter, op, op.getTensor().getType().getElementType());
+    auto promotedOperands = getTypeConverter()->promoteOperands(
+        loc, op->getOperands(), adaptor.getOperands(), rewriter);
+
+    Value boxArrayPtr = rewriter.create<LLVM::AllocaOp>(
+        loc, llvmPointerType, llvmInt64Type, makeI64Const(rewriter, op, 5));
+    for (auto [index, value] : llvm::enumerate(adaptor.getBoxDimensions())) {
+      Value gep = rewriter.create<LLVM::GEPOp>(
+          loc, llvmPointerType, llvmPointerType, boxArrayPtr,
+          makeI64Const(rewriter, op, index));
+      rewriter.create<LLVM::StoreOp>(loc, value, gep);
+    }
+
+    nvgpu::TensorMapDescriptorType desc = op.getTensorMap().getType();
+    // Set the arguments for the function call.
+    SmallVector<Value> arguments;
+    arguments.push_back(promotedOperands[0]); // rank
+    arguments.push_back(promotedOperands[1]); // descriptor
+    arguments.push_back(tensorElementType);   // data type
+    arguments.push_back(
+        makeI64Const(rewriter, op, (int)desc.getInterleave())); // interleave
+    arguments.push_back(
+        makeI64Const(rewriter, op, (int)desc.getSwizzle())); // swizzle
+    arguments.push_back(
+        makeI64Const(rewriter, op, (int)desc.getL2promo())); // l2promo
+    arguments.push_back(makeI64Const(rewriter, op, (int)desc.getOob())); // oob
+    arguments.push_back(boxArrayPtr); // box dimensions
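+    // Note: the number, order, and types of these arguments must match the
+    // signature of the runtime wrapper called below
+    // (mgpuTensorMapEncodeTiledMemref in CudaRuntimeWrappers.cpp): rank,
+    // memref descriptor, data type, interleave, swizzle, l2promo, oob, and a
+    // pointer to the box dimensions.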
+
+    // Set the data types of the arguments.
+    SmallVector<Type> argTypes = {
+        llvmInt64Type,   /* int64_t tensorRank */
+        llvmPointerType, /* ptr */
+        llvmInt64Type,   /* int64_t */
+        llvmInt64Type,   /* int64_t */
+        llvmInt64Type,   /* int64_t */
+        llvmInt64Type,   /* int64_t */
+        llvmInt64Type,   /* int64_t */
+        llvmPointerType  /* ptr */
+    };
+    FunctionCallBuilder hostRegisterCallBuilder = {
+        "mgpuTensorMapEncodeTiledMemref", llvmPointerType, argTypes};
+    Value tensorMap =
+        hostRegisterCallBuilder.create(loc, rewriter, arguments).getResult();
+
+    rewriter.replaceOp(op, tensorMap);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -936,6 +1055,7 @@
       NVGPUMBarrierArriveNoCompleteLowering, // nvgpu.mbarrier.arrive.no_complete
       NVGPUMBarrierTestWaitLowering,         // nvgpu.mbarrier.test_wait_parity
       NVGPUMBarrierTryWaitParityLowering,    // nvgpu.mbarrier.try_wait_parity
+      NVGPUTmaCreateDescriptorOpLowering,    // nvgpu.tma.create.descriptor
       NVGPUMBarrierArriveExpectTxLowering,   // nvgpu.mbarrier.arrive.expect_tx
       NVGPUTmaAsyncLoadOpLowering,           // nvgpu.tma.async.load
       MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -355,6 +355,17 @@
   return success();
 }
 
+LogicalResult TmaCreateDescriptorOp::verify() {
+  if (getBoxDimensions().size() > 5) {
+    return emitError() << "Maximum 5 dimensional box is supported.";
+  }
+  nvgpu::TensorMapDescriptorType desc = getTensorMap().getType();
+  if (desc.getInterleave() != TensorMapInterleaveKind::INTERLEAVE_NONE)
+    return emitError() << "Interleave options are not supported yet.";
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd dialect, type, and op definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -254,6 +254,71 @@
   defaultDevice = device;
 }
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
+    CUtensorMap *tensorMap,             // Tensor map object
+    CUtensorMapDataType tensorDataType, // Tensor data type
+    cuuint32_t tensorRank,              // Dimensionality of tensor
+    void *globalAddress,                // Starting address
+    const cuuint64_t *globalDim,        // Tensor size (number of elements)
+    const cuuint64_t *globalStrides,    // Stride size (in bytes)
+    const cuuint32_t *boxDim,           // Traversal box (number of elements)
+    const cuuint32_t *elementStrides,   // Traversal stride
+    CUtensorMapInterleave interleave,   // Type of interleaved layout
+    CUtensorMapSwizzle swizzle,         // Bank swizzling pattern
+    CUtensorMapL2promotion l2Promotion, // L2 promotion size
+    CUtensorMapFloatOOBfill oobFill     // Padding zfill or NaN fill
+) {
+  ScopedContext scopedContext;
+  CUDA_REPORT_IF_ERROR(cuTensorMapEncodeTiled(
+      tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
+      globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
+      oobFill));
+}
+
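+// Converts an unranked-memref descriptor into the argument set expected by
+// cuTensorMapEncodeTiled: globalDim is taken from the memref sizes, element
+// strides are set to 1, and globalStrides is computed assuming a contiguous
+// row-major layout. The dimension order is reversed because the CUDA driver
+// expects the innermost (fastest-varying) dimension first. The resulting
+// CUtensorMap is copied to device memory and a pointer to that copy is
+// returned.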
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
+    int64_t tensorRank,                       // Dimensionality of tensor
+    StridedMemRefType<char, 1> *descriptor,   // Ranked memref descriptor
+    const CUtensorMapDataType tensorDataType, // Data type of the tensor
+    CUtensorMapInterleave interleave,         // Type of interleaved layout
+    CUtensorMapSwizzle swizzle,               // Bank swizzling pattern
+    CUtensorMapL2promotion l2Promotion,       // L2 promotion size
+    CUtensorMapFloatOOBfill oobFill,          // Padding zfill or NaN fill
+    int64_t *inputBoxDims                     // Tile box size per dimension
+) {
+  CUtensorMap tensorMap;
+
+  auto *globalAddress = descriptor->data;
+  uint32_t boxDim[5] = {0}, elementStrides[5] = {0};
+  uint64_t globalDim[5] = {0}, globalStrides[5] = {0};
+  uint32_t tensorRank32 = uint32_t(tensorRank);
+
+  // Element size in bytes for each CUtensorMapDataType enumerator.
+  static const int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
+                                           4, 8, 2, 4, 4, 4};
+  for (int64_t r = 0; r < tensorRank; ++r) {
+    elementStrides[r] = uint32_t(1);
+    boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
+    globalDim[r] = static_cast<uint64_t>(descriptor->sizes[tensorRank - r - 1]);
+  }
+
+  // Cumulative byte strides, assuming the tensor is contiguous.
+  globalStrides[0] = globalDim[0] * elementSizeInBytes[tensorDataType];
+  for (int r = 1; r < tensorRank - 1; r++)
+    globalStrides[r] = globalStrides[r - 1] * globalDim[r];
+
+  ScopedContext scopedContext;
+  mgpuTensorMapEncodeTiled(&tensorMap, tensorDataType, tensorRank32,
+                           globalAddress, globalDim, globalStrides, boxDim,
+                           elementStrides, interleave, swizzle, l2Promotion,
+                           oobFill);
+  // Copy the created tensor map to device memory and return its address.
+  CUdeviceptr dTensorMap;
+  CUDA_REPORT_IF_ERROR(cuMemAlloc(&dTensorMap, sizeof(CUtensorMap)));
+  CUDA_REPORT_IF_ERROR(cuMemcpy(dTensorMap,
+                                reinterpret_cast<CUdeviceptr>(&tensorMap),
+                                sizeof(CUtensorMap)));
+  return reinterpret_cast<void *>(dTensorMap);
+}
+
 #ifdef MLIR_ENABLE_CUDA_CUSPARSE
 ///
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -607,10 +607,10 @@
 // -----
 
 // CHECK-LABEL: func @async_tma_load
-!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none, l2promo = none, oob = nan, interleave = interleave_16b>
+!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32, 3>, swizzle = none, l2promo = none, oob = nan, interleave = none>
 !tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32, 3>, swizzle = swizzle_32b, l2promo = none, oob = zero, interleave = none>
 !tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32, 3>, swizzle = swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
-!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32, 3>, swizzle = swizzle_128b, l2promo = l2promo_128b, oob = zero, interleave = none>
+!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32, 3>, swizzle = swizzle_128b, l2promo = l2promo_128b, oob = zero, interleave = interleave_16b>
 !tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
 !mbarrier = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
 func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
@@ -635,18 +635,15 @@
   func.return
 }
 
-// -----
-
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
-module @find_parent{
-  func.func @main() {
-    %c1 = arith.constant 1 : index
-    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
-               threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, %block_z = %c1) {
-      // CHECK: memref.get_global @__mbarrier : memref<1xi64, 3>
-      %barrier = nvgpu.mbarrier.create -> !barrierType
-      gpu.terminator
-    }
-    func.return
-  }
+func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : memref<128xf32>) {
+  %crd0 = arith.constant 64 : index
+  %crd1 = arith.constant 128 : index
+  %devicePtr2d_unranked = memref.cast %devicePtr2d : memref<64x128xf32> to memref<*xf32>
+  // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref
+  %tensorMap2d = nvgpu.tma.create.descriptor %devicePtr2d_unranked box[%crd0, %crd1] : memref<*xf32> -> !tensorMap2d
+
+  %devicePtr1d_unranked = memref.cast %devicePtr1d : memref<128xf32> to memref<*xf32>
+  // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref
+  %tensorMap1d = nvgpu.tma.create.descriptor %devicePtr1d_unranked box[%crd1] : memref<*xf32> -> !tensorMap1d
+  func.return
 }