diff --git a/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h b/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
--- a/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_CONVERSION_LLVMCOMMON_LOWERINGOPTIONS_H
 #define MLIR_CONVERSION_LLVMCOMMON_LOWERINGOPTIONS_H
 
+#include "mlir/IR/BuiltinTypes.h"
 #include "llvm/IR/DataLayout.h"
 
 namespace mlir {
@@ -66,6 +67,9 @@
   /// Get the index bitwidth.
   unsigned getIndexBitwidth() const { return indexBitwidth; }
 
+  /// Hook to customize the index type used when converting a MemRefType.
+  llvm::function_ref<IntegerType(MemRefType)> memrefIndexTypeConverter = nullptr;
+
 private:
   unsigned indexBitwidth;
 };
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -653,10 +653,18 @@
 
 } // namespace
 
+static IntegerType getIndexTypeForMemRef(MemRefType type) {
+  if (type.getMemorySpaceAsInt() == 3)
+    // i.e. nvgpu::NVGPUDialect::kSharedMemoryAddressSpace.
+    return IntegerType::get(type.getContext(), 32);
+  return IntegerType::get(type.getContext(), 64);
+}
+
 void GpuToLLVMConversionPass::runOnOperation() {
   LowerToLLVMOptions options(&getContext());
   options.useOpaquePointers = useOpaquePointers;
   options.useBarePtrCallConv = hostBarePtrCallConv;
+  options.memrefIndexTypeConverter = getIndexTypeForMemRef;
   LLVMTypeConverter converter(&getContext(), options);
 
   RewritePatternSet patterns(&getContext());
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -202,6 +202,13 @@
 /// Import the GPU Ops to NVVM Patterns.
 #include "GPUToNVVM.cpp.inc"
 
+static IntegerType getIndexTypeForMemRef(MemRefType type) {
+  if (type.getMemorySpaceAsInt() == 3)
+    // i.e. nvgpu::NVGPUDialect::kSharedMemoryAddressSpace.
+    return IntegerType::get(type.getContext(), 32);
+  return IntegerType::get(type.getContext(), 64);
+}
+
 /// A pass that replaces all occurrences of GPU device operations with their
 /// corresponding NVVM equivalent.
 ///
@@ -232,6 +239,7 @@
     options.overrideIndexBitwidth(indexBitwidth);
   options.useOpaquePointers = useOpaquePointers;
   options.useBarePtrCallConv = useBarePtrCallConv;
+  options.memrefIndexTypeConverter = getIndexTypeForMemRef;
 
   // Apply in-dialect lowering. In-dialect lowering will replace
   // ops which need to be lowered further, which is not supported by a
diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -339,7 +339,9 @@
   }
 
   auto ptrTy = getPointerType(elementType, *addressSpace);
-  auto indexTy = getIndexType();
+  auto indexTy = options.memrefIndexTypeConverter
+                     ? options.memrefIndexTypeConverter(type)
+                     : getIndexType();
 
   SmallVector<Type, 5> results = {ptrTy, ptrTy, indexTy};
   auto rank = type.getRank();
diff --git a/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt b/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
--- a/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
@@ -14,6 +14,7 @@
   LINK_LIBS PUBLIC
   MLIRAnalysis
   MLIRDataLayoutInterfaces
+  MLIRIndexDialect
   MLIRLLVMCommonConversion
   MLIRMemRefDialect
   MLIRMemRefUtils
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -12,11 +12,14 @@
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
@@ -29,15 +32,22 @@
 
 using namespace mlir;
 
-/// GPU has 32 bit registers, this function truncates values when larger width
-/// is not needed.
+/// GPU has 32 bit registers; this function truncates values when a larger
+/// width is not needed.
 static Value truncToI32(ConversionPatternRewriter &rewriter, Location loc,
                         Value value) {
   Type type = value.getType();
+  if (llvm::isa<IndexType>(type))
+    return rewriter.create(loc, rewriter.getI32Type(), value);
+
   assert(llvm::isa<IntegerType>(type) && "expected an integer Value");
   if (type.getIntOrFloatBitWidth() <= 32)
     return value;
-  return rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), value);
+  // Avoid direct use of LLVM and instead roundtrip through the index dialect,
+  // which connects things properly.
+  Value index =
+      rewriter.create(loc, rewriter.getIndexType(), value);
+  return rewriter.create(loc, rewriter.getI32Type(), index);
 }
 
 /// Returns the type for the intrinsic given the vectorResultType of the
@@ -97,8 +107,8 @@
   Type f32x1Ty = LLVM::getFixedVectorType(f32Ty, 1);
 
   auto makeConst = [&](int32_t index) -> Value {
-    return rewriter.create<LLVM::ConstantOp>(loc, IntegerType::get(ctx, 32),
-                                             rewriter.getI32IntegerAttr(index));
+    return rewriter.create(loc,
+                           rewriter.getIndexAttr(index));
   };
 
   if (arrayType) {
@@ -196,8 +206,8 @@
          idx < innerSize; idx++) {
       result.push_back(rewriter.create<LLVM::ExtractElementOp>(
           loc, toUse,
-          rewriter.create<LLVM::ConstantOp>(
-              loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(idx))));
+          rewriter.create(loc,
+                          rewriter.getIndexAttr(idx))));
     }
     continue;
   }
@@ -390,47 +400,6 @@
                                   memrefType);
 }
 
-struct ConvertNVGPUToNVVMPass
-    : public impl::ConvertNVGPUToNVVMPassBase<ConvertNVGPUToNVVMPass> {
-  using Base::Base;
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry
-        .insert();
-  }
-
-  void runOnOperation() override {
-    LowerToLLVMOptions options(&getContext());
-    options.useOpaquePointers = useOpaquePointers;
-    RewritePatternSet patterns(&getContext());
-    LLVMTypeConverter converter(&getContext(), options);
-    IRRewriter rewriter(&getContext());
-    /// device-side async tokens cannot be materialized in nvvm. We just
-    /// convert them to a dummy i32 type in order to easily drop them during
-    /// conversion.
-    converter.addConversion([&](nvgpu::DeviceAsyncTokenType type) -> Type {
-      return converter.convertType(IntegerType::get(type.getContext(), 32));
-    });
-    converter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
-      return converter.convertType(IntegerType::get(type.getContext(), 64));
-    });
-    converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
-      return converter.convertType(createMBarrierMemrefType(rewriter, type));
-    });
-    converter.addConversion([&](nvgpu::TensorMapDescriptorType type) -> Type {
-      return converter.getPointerType(type.getTensor().getElementType());
-    });
-    populateNVGPUToNVVMConversionPatterns(converter, patterns);
-    LLVMConversionTarget target(getContext());
-    target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
-    target.addLegalDialect<::mlir::memref::MemRefDialect>();
-    target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
-    if (failed(applyPartialConversion(getOperation(), target,
-                                      std::move(patterns))))
-      signalPassFailure();
-  }
-};
-
 /// Returns the constraints for the sparse MMA inline assembly instruction.
 static std::string buildMmaSparseAsmConstraintString(unsigned matASize,
                                                      unsigned matBSize,
@@ -655,11 +624,10 @@
       // memory) of CpAsyncOp is read only for SrcElements number of elements.
      // The rest of the DstElements in the destination (shared memory) are
       // filled with zeros.
-      Value c3I32 = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(3));
-      Value bitwidth = rewriter.create<LLVM::ConstantOp>(
-          loc, rewriter.getI32Type(),
-          rewriter.getI32IntegerAttr(srcMemrefType.getElementTypeBitWidth()));
+      Value c3I32 =
+          rewriter.create(loc, rewriter.getIndexAttr(3));
+      Value bitwidth = rewriter.create(
+          loc, rewriter.getIndexAttr(srcMemrefType.getElementTypeBitWidth()));
       Value srcElementsI32 =
           rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), srcBytes);
       srcBytes = rewriter.create<LLVM::LShrOp>(
@@ -679,9 +647,8 @@
                                               srcBytes);
 
     // Drop the result token.
-    Value zero = rewriter.create<LLVM::ConstantOp>(
-        op->getLoc(), IntegerType::get(op.getContext(), 32),
-        rewriter.getI32IntegerAttr(0));
+    Value zero = rewriter.create(op->getLoc(),
+                                 rewriter.getIndexAttr(0));
     rewriter.replaceOp(op, zero);
     return success();
   }
@@ -697,9 +664,8 @@
                   ConversionPatternRewriter &rewriter) const override {
     rewriter.create<NVVM::CpAsyncCommitGroupOp>(op.getLoc());
     // Drop the result token.
-    Value zero = rewriter.create<LLVM::ConstantOp>(
-        op->getLoc(), IntegerType::get(op.getContext(), 32),
-        rewriter.getI32IntegerAttr(0));
+    Value zero = rewriter.create(op->getLoc(),
+                                 rewriter.getIndexAttr(0));
     rewriter.replaceOp(op, zero);
     return success();
   }
@@ -775,8 +741,7 @@
     rewriter.setInsertionPoint(op);
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
-
-    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
+    Value count = truncToI32(rewriter, op->getLoc(), op.getCount());
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp(op, barrier,
@@ -824,7 +789,7 @@
                                    op.getBarrier(), adaptor.getBarrier());
     Type tokenType = getTypeConverter()->convertType(
         nvgpu::MBarrierTokenType::get(op->getContext()));
-    Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
+    Value count = truncToI32(rewriter, op->getLoc(), op.getCount());
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp(
           op, tokenType, barrier, count);
@@ -869,7 +834,7 @@
                   ConversionPatternRewriter &rewriter) const override {
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
-    Value txcount = truncToI32(rewriter, op->getLoc(), adaptor.getTxcount());
+    Value txcount = truncToI32(rewriter, op->getLoc(), op.getTxcount());
 
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp(
@@ -893,8 +858,8 @@
                   ConversionPatternRewriter &rewriter) const override {
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
-    Value ticks = truncToI32(rewriter, op->getLoc(), adaptor.getTicks());
-    Value phase = truncToI32(rewriter, op->getLoc(), adaptor.getPhase());
+    Value ticks = truncToI32(rewriter, op->getLoc(), op.getTicks());
+    Value phase = truncToI32(rewriter, op->getLoc(), op.getPhase());
 
     if (isMbarrierShared(op.getBarrier().getType())) {
       rewriter.replaceOpWithNewOp(
@@ -919,7 +884,7 @@
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
 
-    SmallVector<Value> coords = adaptor.getCoordinates();
+    SmallVector<Value> coords = op.getCoordinates();
     for (auto [index, value] : llvm::enumerate(coords)) {
       coords[index] = truncToI32(rewriter, op->getLoc(), value);
     }
@@ -930,6 +895,9 @@
   }
 };
 
+/// Create an i64 LLVM constant value. This should only be used with
+/// unambiguous sink operations where we know for a fact that the underlying
+/// LLVM op precisely wants an i64.
 static Value makeI64Const(RewriterBase &rewriter, Operation *op,
                           int32_t index) {
   return rewriter.create<LLVM::ConstantOp>(op->getLoc(),
@@ -1063,3 +1031,57 @@
                NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
                NVGPUMmaSparseSyncLowering>(converter);
 }
+
+static IntegerType getIndexTypeForMemRef(MemRefType type) {
+  if (type.getMemorySpaceAsInt() ==
+      nvgpu::NVGPUDialect::kSharedMemoryAddressSpace)
+    return IntegerType::get(type.getContext(), 32);
+  return IntegerType::get(type.getContext(), 64);
+}
+
+namespace {
+
+struct ConvertNVGPUToNVVMPass
+    : public impl::ConvertNVGPUToNVVMPassBase<ConvertNVGPUToNVVMPass> {
+  using Base::Base;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert();
+  }
+
+  void runOnOperation() override {
+    LowerToLLVMOptions options(&getContext());
+    options.useOpaquePointers = useOpaquePointers;
+    options.memrefIndexTypeConverter = getIndexTypeForMemRef;
+    RewritePatternSet patterns(&getContext());
+    LLVMTypeConverter converter(&getContext(), options);
+    IRRewriter rewriter(&getContext());
+    /// device-side async tokens cannot be materialized in nvvm. We just
+    /// convert them to a dummy i32 type in order to easily drop them during
+    /// conversion.
+    converter.addConversion([&](nvgpu::DeviceAsyncTokenType type) -> Type {
+      return converter.convertType(IntegerType::get(type.getContext(), 32));
+    });
+    converter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
+      return converter.convertType(IntegerType::get(type.getContext(), 64));
+    });
+    converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
+      return converter.convertType(createMBarrierMemrefType(rewriter, type));
+    });
+    converter.addConversion([&](nvgpu::TensorMapDescriptorType type) -> Type {
+      return converter.getPointerType(type.getTensor().getElementType());
+    });
+    populateNVGPUToNVVMConversionPatterns(converter, patterns);
+    LLVMConversionTarget target(getContext());
+    target.addLegalDialect<::mlir::index::IndexDialect>();
+    target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
+    target.addLegalDialect<::mlir::memref::MemRefDialect>();
+    target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+} // namespace
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir
--- a/mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir
@@ -1,15 +1,5 @@
-// RUN: mlir-opt %s --convert-nvgpu-to-nvvm -gpu-kernel-outlining \
-// RUN:   -convert-scf-to-cf -convert-nvvm-to-llvm \
-// RUN:   -convert-vector-to-llvm \
-// RUN:   -convert-math-to-llvm \
-// RUN:   -expand-strided-metadata \
-// RUN:   -lower-affine \
-// RUN:   -convert-index-to-llvm=index-bitwidth=32 \
-// RUN:   -convert-arith-to-llvm \
-// RUN:   -finalize-memref-to-llvm \
-// RUN:   -convert-func-to-llvm \
-// RUN:   -canonicalize \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm{use-opaque-pointers=1},lower-affine,convert-scf-to-cf,convert-vector-to-llvm,convert-math-to-llvm,expand-strided-metadata,lower-affine,convert-index-to-llvm{index-bitwidth=32},convert-arith-to-llvm,reconcile-unrealized-casts,gpu-to-cubin{chip=sm_90 features=+ptx80 dump-ptx}))' \
+// RUN: mlir-opt %s \
+// RUN:   -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_90 cubin-features=+ptx80 dump-ptx" \
 // RUN: 2&>1 | FileCheck %s --check-prefixes=CHECK-PTX
 
 // CHECK-PTX: mbarrier.init.shared.b64
@@ -87,4 +77,4 @@
     }
     return
   }
-}
\ No newline at end of file
+}
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -20,6 +20,7 @@
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
+#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
@@ -70,6 +71,9 @@
       *this, "cubin-features",
       llvm::cl::desc("Features to use to serialize to cubin."),
       llvm::cl::init("+ptx76")};
+  PassOptions::Option<bool> dumpPtx{
+      *this, "dump-ptx", llvm::cl::desc("Whether to dump the produced ptx."),
+      llvm::cl::init(false)};
 };
 
 //===----------------------------------------------------------------------===//
@@ -124,6 +128,14 @@
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
 
+  // TODO: C++20 designated initializers.
+  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
+  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
+  pm.addNestedPass<gpu::GPUModuleOp>(
+      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
+
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
+
   // TODO: C++20 designated initializers.
   // The following pass is inconsistent.
   // ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
@@ -134,13 +146,6 @@
       createLowerGpuOpsToNVVMOpsPass(/*indexBitWidth=*/
                                      options.kernelIndexBitWidth));
 
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-
   // TODO: C++20 designated initializers.
   GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
   // Note: hostBarePtrCallConv must be false for now otherwise
@@ -164,6 +169,8 @@
   pm.addNestedPass<gpu::GPUModuleOp>(
      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+
   // Sprinkle some cleanups.
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
@@ -173,7 +180,8 @@
 
 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
   pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-      options.cubinTriple, options.cubinChip, options.cubinFeatures));
+      options.cubinTriple, options.cubinChip, options.cubinFeatures,
+      /*optLevel=*/2, /*dumpPtx=*/options.dumpPtx));
 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }
@@ -182,8 +190,6 @@
   //===----------------------------------------------------------------------===//
   // Host-specific stuff.
   //===----------------------------------------------------------------------===//
-  // Important, must be run at the top-level.
-  pm.addPass(createGpuKernelOutliningPass());
 
   // Important, all host passes must be run at the func level so that host
   // conversions can remain with 64 bit indices without polluting the GPU
@@ -228,17 +234,6 @@
   pm.addNestedPass<func::FuncOp>(
       createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
 
-  // TODO: C++20 designated initializers.
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt.indexBitwidth = options.hostIndexBitWidth;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
-  pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-
   // Sprinkle some cleanups.
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addNestedPass<func::FuncOp>(createCSEPass());
@@ -246,6 +241,20 @@
   //===----------------------------------------------------------------------===//
   // GPUModule-specific stuff.
   //===----------------------------------------------------------------------===//
+
+  // Due to gpu::LaunchOp and gpu::LaunchFuncOp layering and conversions, there
+  // is currently a need to call convertNVGPUToNVVM at the top-level to get
+  // proper types at the function boundary for the TMA descriptors.
+  // TODO: Fix this broken layering: conversion of the TMA descriptor should be
+  // separated from introducing LLVM types.
+  // TODO: C++20 designated initializers.
+  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
+  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
+  pm.addPass(createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
+
+  // Important, must be run at the top-level.
+  pm.addPass(createGpuKernelOutliningPass());
+
   buildGpuPassPipeline(pm, options);
 
   //===----------------------------------------------------------------------===//
@@ -258,6 +267,8 @@
   pm.addNestedPass<func::FuncOp>(
       createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
+  pm.addPass(createConvertNVVMToLLVMPass());
+
   ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
   // Must be 64b on the host, things don't compose properly around
   // gpu::LaunchOp and gpu::HostRegisterOp.
@@ -265,6 +276,8 @@
   convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
   pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
 
+  pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
+
   // This must happen after cubin translation otherwise gpu.launch_func is
   // illegal if no cubin annotation is present.
   // TODO: C++20 designated initializers.