diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -425,21 +425,22 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_i64ptr_any:$addr, LLVM_Type:$token)> { + Arguments<(ins LLVM_i64ptr_any:$addr, LLVM_Type:$state)> { string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $token}); + $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state}); }]; - let assemblyFormat = "$addr `,` $token attr-dict `:` type(operands) `->` type($res)"; + let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; } def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_i64ptr_shared:$addr, LLVM_Type:$token)> { + Arguments<(ins LLVM_i64ptr_shared:$addr, LLVM_Type:$state)> { string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $token}); + $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state}); }]; - let assemblyFormat = "$addr `,` $token attr-dict `:` type(operands) `->` type($res)"; + let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; } + //===----------------------------------------------------------------------===// // NVVM synchronization op definitions //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td @@ -44,6 +44,11 @@ /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`. 
static bool hasSharedMemoryAddressSpace(MemRefType type);
+
+    /// Return true if the given Attribute has an integer address
+    /// space that matches the NVVM shared memory address space or
+    /// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
+    static bool isSharedMemoryAddressSpace(Attribute type);
+
     /// Defines the MemRef memory space attribute numeric value that indicates
     /// a memref is located in global memory. This should correspond to the
     /// value used in NVVM.
@@ -77,6 +82,24 @@
   }];
 }
 
+def NVGPU_MBarrier : NVGPU_Type<"MBarrier", "mbarrier.barrier", []> {
+  let summary = "mbarrier barrier type";
+  let description = [{
+    This is the type for a mbarrier object in shared memory that is used
+    to synchronize a variable number of threads.
+
+    The mbarrier object is 64 bit with 8 byte alignment. The mbarrier object
+    can be initiated and invalidated.
+
+    See for more details:
+    https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object
+  }];
+  let parameters = (ins "Attribute":$memorySpace);
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
+
 //===----------------------------------------------------------------------===//
 // NVGPU Op Definitions
 //===----------------------------------------------------------------------===//
@@ -355,4 +378,95 @@
   }];
 }
 
+def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
+  let summary = "Creates a `nvgpu.mbarrier` object.";
+  let description = [{
+    The Op generates an `mbarrier` object, which is a barrier created in
+    shared memory and supports various synchronization behaviors for threads.
+
+    The `mbarrier` object has the following type and alignment requirements:
+      Type: .b64, Alignment: 8, Memory space: .shared
+
+    Example:
+    ```mlir
+      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+    ```
+  }];
+  let arguments = (ins);
+  let results = (outs NVGPU_MBarrier:$barrier);
+  let assemblyFormat = [{
+    attr-dict `->` type($barrier)
+  }];
+}
+
+def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
+  let summary = "Initialize the `nvgpu.mbarrier`.";
+  let description = [{
+    The Op initializes the `mbarrier` object with the given number of threads.
+
+    Example:
+    ```mlir
+      %num_threads = gpu.block_dim x
+      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+      nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrier:$barrier, Index:$count);
+  let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier)";
+}
+
+def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
+  let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
+  let description = [{
+    Checks whether the mbarrier object has completed the phase. It is a
+    non-blocking instruction which tests for the completion of the phase.
+
+    Example:
+    ```mlir
+      %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrier:$barrier, NVGPU_MBarrierToken:$token);
+  let results = (outs I1:$waitComplete);
+  let assemblyFormat = "$barrier `,` $token attr-dict `:` type($barrier) `,` type($token)";
+}
+
+def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
+  let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
+  let description = [{
+    The Op performs arrive-on operation on the `mbarrier` object and returns a
+    `nvgpu.mbarrier.token`.
+
+    For more information, see
+    https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
+
+    Example:
+    ```mlir
+      %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrier:$barrier);
+  let results = (outs NVGPU_MBarrierToken:$token);
+  let assemblyFormat = "$barrier attr-dict `:` type($barrier) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
+  let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
+  let description = [{
+    The Op performs arrive-on operation on the `mbarrier` object and returns a
+    `nvgpu.mbarrier.token`.
+
+    The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
+
+    Example:
+    ```mlir
+      %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrier:$barrier,
+                       Index:$count);
+  let results = (outs NVGPU_MBarrierToken:$token);
+  let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier) `->` type($token)";
+}
+
 #endif // NVGPU
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
@@ -337,23 +338,74 @@
 }
 };
 
+/// Returns whether the mbarrier object has shared memory address space.
+static bool isMbarrierShared(nvgpu::MBarrierType barrierType) {
+  return (mlir::nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(
+      barrierType.getMemorySpace()));
+}
+
+/// Returns the memory space attribute of the mbarrier object.
+static Attribute getMbarrierMemorySpace(RewriterBase &rewriter,
+                                        nvgpu::MBarrierType barrierType) {
+  Attribute memorySpace = {};
+  if (isMbarrierShared(barrierType)) {
+    memorySpace = rewriter.getI64IntegerAttr(
+        nvgpu::NVGPUDialect::kSharedMemoryAddressSpace);
+  }
+  return memorySpace;
+}
+
+/// Returns the memref type of the mbarrier object. The type is defined in the
+/// MBarrierType.
+static MemRefType createMBarrierMemrefType(RewriterBase &rewriter,
+                                           nvgpu::MBarrierType barrierType) {
+  Attribute memorySpace = getMbarrierMemorySpace(rewriter, barrierType);
+  MemRefLayoutAttrInterface layout;
+  return MemRefType::get({1}, rewriter.getI64Type(), layout, memorySpace);
+}
+
+/// Returns the base pointer of the mbarrier object.
+static Value getMbarrierPtr(ConversionPatternRewriter &rewriter,
+                            LLVMTypeConverter &typeConverter,
+                            TypedValue<nvgpu::MBarrierType> barrier,
+                            Value barrierMemref) {
+  MemRefType memrefType = createMBarrierMemrefType(rewriter, barrier.getType());
+  MemRefDescriptor memRefDescriptor(barrierMemref);
+  return memRefDescriptor.bufferPtr(rewriter, barrier.getLoc(), typeConverter,
+                                    memrefType);
+}
+
 struct ConvertNVGPUToNVVMPass
     : public impl::ConvertNVGPUToNVVMPassBase<ConvertNVGPUToNVVMPass> {
   using Base::Base;
 
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry
+        .insert<memref::MemRefDialect>();
+  }
+
   void runOnOperation() override {
     LowerToLLVMOptions options(&getContext());
     options.useOpaquePointers = useOpaquePointers;
     RewritePatternSet patterns(&getContext());
     LLVMTypeConverter converter(&getContext(), options);
-    /// device-side async tokens cannot be materialized in nvvm. We just convert
-    /// them to a dummy i32 type in order to easily drop them during conversion.
+ IRRewriter rewriter(&getContext()); + /// device-side async tokens cannot be materialized in nvvm. We just + /// convert them to a dummy i32 type in order to easily drop them during + /// conversion. converter.addConversion([&](nvgpu::DeviceAsyncTokenType type) -> Type { return converter.convertType(IntegerType::get(type.getContext(), 32)); }); + converter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type { + return converter.convertType(IntegerType::get(type.getContext(), 64)); + }); + converter.addConversion([&](nvgpu::MBarrierType type) -> Type { + return converter.convertType(createMBarrierMemrefType(rewriter, type)); + }); populateNVGPUToNVVMConversionPatterns(converter, patterns); LLVMConversionTarget target(getContext()); target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); + target.addLegalDialect<::mlir::memref::MemRefDialect>(); target.addLegalDialect<::mlir::NVVM::NVVMDialect>(); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) @@ -651,11 +703,164 @@ } }; +/// Creates mbarrier object in shared memory +struct NVGPUMBarrierCreateLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + template + memref::GlobalOp generateGlobalBarrier(ConversionPatternRewriter &rewriter, + Operation *funcOp, moduleT moduleOp, + MemRefType barrierType) const { + SymbolTable symbolTable(moduleOp); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(&moduleOp.front()); + auto global = rewriter.create( + funcOp->getLoc(), "__mbarrier", + /*sym_visibility=*/rewriter.getStringAttr("private"), + /*type=*/barrierType, + /*initial_value=*/ElementsAttr(), + /*constant=*/false, + /*alignment=*/rewriter.getI64IntegerAttr(8)); + symbolTable.insert(global); + return global; + } + + LogicalResult + matchAndRewrite(nvgpu::MBarrierCreateOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Operation *funcOp = op->getParentOp(); + Operation *mOp = 
funcOp->getParentOp(); + MemRefType barrierType = + createMBarrierMemrefType(rewriter, op.getBarrier().getType()); + + memref::GlobalOp global; + if (auto moduleOp = dyn_cast(mOp)) + global = generateGlobalBarrier(rewriter, funcOp, moduleOp, barrierType); + else if (auto moduleOp = dyn_cast(mOp)) + global = generateGlobalBarrier(rewriter, funcOp, moduleOp, barrierType); + + rewriter.setInsertionPoint(op); + rewriter.replaceOpWithNewOp(op, barrierType, + global.getName()); + return success(); + } +}; + +/// Lowers `nvgpu.mbarrier.init` to `nvvm.mbarrier.init` +struct NVGPUMBarrierInitLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(nvgpu::MBarrierInitOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.setInsertionPoint(op); + Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), + op.getBarrier(), adaptor.getBarrier()); + + Value count = adaptor.getCount(); + if (!adaptor.getCount().getType().isInteger(32)) { + count = rewriter.create(op->getLoc(), + rewriter.getI32Type(), count); + } + + if (isMbarrierShared(op.getBarrier().getType())) { + rewriter.replaceOpWithNewOp(op, barrier, + count); + } else { + rewriter.replaceOpWithNewOp(op, barrier, count); + } + return success(); + } +}; + +/// Lowers `nvgpu.mbarrier.arrive` to `nvvm.mbarrier.arrive` +struct NVGPUMBarrierArriveLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + LogicalResult + matchAndRewrite(nvgpu::MBarrierArriveOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), + op.getBarrier(), adaptor.getBarrier()); + Type tokenType = getTypeConverter()->convertType( + nvgpu::MBarrierTokenType::get(op->getContext())); + if (isMbarrierShared(op.getBarrier().getType())) { + rewriter.replaceOpWithNewOp(op, tokenType, + barrier); + } else { + 
rewriter.replaceOpWithNewOp(op, tokenType, + barrier); + } + return success(); + } +}; + +/// Lowers `nvgpu.mbarrier.arrive.nocomplete` to +/// `nvvm.mbarrier.arrive.nocomplete` +struct NVGPUMBarrierArriveNoCompleteLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern< + nvgpu::MBarrierArriveNoCompleteOp>::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(nvgpu::MBarrierArriveNoCompleteOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), + op.getBarrier(), adaptor.getBarrier()); + Type tokenType = getTypeConverter()->convertType( + nvgpu::MBarrierTokenType::get(op->getContext())); + Value count = adaptor.getCount(); + if (!adaptor.getCount().getType().isInteger(32)) { + count = rewriter.create(op->getLoc(), + rewriter.getI32Type(), count); + } + if (isMbarrierShared(op.getBarrier().getType())) { + rewriter.replaceOpWithNewOp( + op, tokenType, barrier, count); + } else { + rewriter.replaceOpWithNewOp( + op, tokenType, barrier, count); + } + return success(); + } +}; + +/// Lowers `nvgpu.mbarrier.test.wait` to `nvvm.mbarrier.test.wait` +struct NVGPUMBarrierTestWaitLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern< + nvgpu::MBarrierTestWaitOp>::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(nvgpu::MBarrierTestWaitOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), + op.getBarrier(), adaptor.getBarrier()); + Type retType = rewriter.getI1Type(); + if (isMbarrierShared(op.getBarrier().getType())) { + rewriter.replaceOpWithNewOp( + op, retType, barrier, adaptor.getToken()); + } else { + rewriter.replaceOpWithNewOp( + op, retType, barrier, adaptor.getToken()); + } + return success(); + } +}; + } // namespace void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns) { - 
patterns.add(converter);
+  patterns.add<
+      NVGPUMBarrierCreateLowering,           // nvgpu.mbarrier.create
+      NVGPUMBarrierInitLowering,             // nvgpu.mbarrier.init
+      NVGPUMBarrierArriveLowering,           // nvgpu.mbarrier.arrive
+      NVGPUMBarrierArriveNoCompleteLowering, // nvgpu.mbarrier.arrive.nocomplete
+      NVGPUMBarrierTestWaitLowering,         // nvgpu.mbarrier.test.wait
+      MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
+      NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
+      NVGPUMmaSparseSyncLowering>(converter);
 }
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -34,8 +34,7 @@
       >();
 }
 
-bool nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
-  Attribute memorySpace = type.getMemorySpace();
+bool nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
   if (!memorySpace)
     return false;
   if (auto intAttr = llvm::dyn_cast<IntegerAttr>(memorySpace))
@@ -45,6 +44,11 @@
   return false;
 }
 
+bool nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
+  Attribute memorySpace = type.getMemorySpace();
+  return isSharedMemoryAddressSpace(memorySpace);
+}
+
 //===----------------------------------------------------------------------===//
 // NVGPU_DeviceAsyncCopyOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -503,3 +503,58 @@
     (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
   return %d : vector<2x2xi32>
 }
+
+// -----
+!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!tokenType = !nvgpu.mbarrier.token
+
+// CHECK-LABEL: func @mbarrier
+func.func @mbarrier() {
+  %num_threads = arith.constant 128 : index
+
+  // CHECK: %[[barMemref:.+]] =
memref.get_global @__mbarrier : memref<1xi64, 3> + %barrier = nvgpu.mbarrier.create -> !barrierType + + // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + nvgpu.mbarrier.init %barrier, %num_threads : !barrierType + + // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]] + %token = nvgpu.mbarrier.arrive %barrier : !barrierType -> !tokenType + + // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType + + func.return +} + +// ----- +!barrierType = !nvgpu.mbarrier.barrier> +!tokenType = !nvgpu.mbarrier.token + +// CHECK-LABEL: func @mbarrier_nocomplete +func.func @mbarrier_nocomplete() { + %num_threads = arith.constant 128 : index + %count = arith.constant 12 : index + + // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier : memref<1xi64, 3> + %barrier = nvgpu.mbarrier.create -> !barrierType + + // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + nvgpu.mbarrier.init %barrier, %num_threads : !barrierType + + // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 
x i64>, array<1 x i64>)> + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]] + %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !barrierType -> !tokenType + + // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType + + func.return +}