diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -44,6 +44,11 @@
     /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`.
     static bool hasSharedMemoryAddressSpace(MemRefType type);
 
+    /// Return true if the given Attribute has an integer address
+    /// space that matches the NVVM shared memory address space or
+    /// is a gpu::AddressSpaceAttr attribute with value `workgroup`.
+    static bool isSharedMemoryAddressSpace(Attribute type);
+
     /// Defines the MemRef memory space attribute numeric value that indicates
     /// a memref is located in global memory. This should correspond to the
     /// value used in NVVM.
@@ -77,6 +82,15 @@
   }];
 }
 
+def NVGPU_MBarrier : NVGPU_Type<"MBarrier", "mbarrier.barrier", []> {
+  let summary = "mbarrier barrier type";
+  let description = [{}];
+  let parameters = (ins "Attribute":$memorySpace);
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
+
 //===----------------------------------------------------------------------===//
 // NVGPU Op Definitions
 //===----------------------------------------------------------------------===//
@@ -355,4 +369,100 @@
   }];
 }
 
+def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
+  let summary = "Creates a `nvgpu.mbarrier` object.";
+  let description = [{
+    The Op generates an `mbarrier` object, which is a barrier created in
+    shared memory and supports various synchronization behaviors for threads.
+
+    The `mbarrier` object has the following type and alignment requirements:
+      Type: .b64, Alignment: 8, Memory space: .shared
+
+    Example:
+    ```mlir
+      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+    ```
+  }];
+  let arguments = (ins);
+  let results = (outs NVGPU_MBarrier:$barrier);
+  let assemblyFormat = [{
+    attr-dict `->` type($barrier)
+  }];
+}
+
+def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
+  let summary = "Initialize the `nvgpu.mbarrier`.";
+  let description = [{
+    The Op initializes the `mbarrier` object with the given number of threads.
+
+    Example:
+    ```mlir
+      %num_threads = gpu.block_dim x
+      %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+      nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrier:$barrier, Index:$count);
+  let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier)";
+}
+
+def NVGPU_MBarrierTestWaitParityOp : NVGPU_Op<"mbarrier.test_wait.parity", []> {
+  let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
+  let description = [{
+    Checks whether the mbarrier object has completed the phase. It is a
+    non-blocking instruction which tests for the completion of the phase.
+
+    The `.parity` variant of the instructions test for the completion of the
+    phase indicated by the operand phaseParity, which is the integer parity
+    of either the current phase or the immediately preceding phase of the
+    mbarrier object.
+
+    Example:
+    ```mlir
+      %isComplete = nvgpu.mbarrier.test_wait.parity %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrier:$barrier, NVGPU_MBarrierToken:$token);
+  let results = (outs I1:$waitComplete);
+  let assemblyFormat = "$barrier `,` $token attr-dict `:` type($barrier) `,` type($token)";
+}
+
+def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
+  let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
+  let description = [{
+    The Op performs arrive-on operation on the `mbarrier` object and returns a
+    `nvgpu.mbarrier.token`.
+
+    For more information, see
+    https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
+
+    Example:
+    ```mlir
+      %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrier:$barrier);
+  let results = (outs NVGPU_MBarrierToken:$token);
+  let assemblyFormat = "$barrier attr-dict `:` type($barrier) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
+  let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
+  let description = [{
+    The Op performs arrive-on operation on the `mbarrier` object and returns a
+    `nvgpu.mbarrier.token`.
+
+    The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
+
+    Example:
+    ```mlir
+      %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+    ```
+  }];
+  let arguments = (ins NVGPU_MBarrier:$barrier,
+                       Index:$count);
+  let results = (outs NVGPU_MBarrierToken:$token);
+  let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier) `->` type($token)";
+}
+
 #endif // NVGPU
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
@@ -337,23 +338,69 @@
   }
 };
 
+/// Returns whether the given mbarrier type lives in shared memory.
+static bool isMbarrierShared(nvgpu::MBarrierType barrierType) {
+  return (mlir::nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(
+      barrierType.getMemorySpace()));
+}
+
+/// Returns the memory space attribute of the given mbarrier type.
+static Attribute getMbarrierMemorySpace(RewriterBase &rewriter,
+                                        nvgpu::MBarrierType barrierType) {
+  Attribute memorySpace = {};
+  if (isMbarrierShared(barrierType)) {
+    memorySpace = rewriter.getI64IntegerAttr(
+        nvgpu::NVGPUDialect::kSharedMemoryAddressSpace);
+  }
+  return memorySpace;
+}
+
+/// Returns the memref type that an mbarrier is lowered to (a single i64).
+static MemRefType createMBarrierMemrefType(RewriterBase &rewriter,
+                                           nvgpu::MBarrierType barrierType) {
+  Attribute memorySpace = getMbarrierMemorySpace(rewriter, barrierType);
+  MemRefLayoutAttrInterface layout;
+  return MemRefType::get({1}, rewriter.getI64Type(), layout, memorySpace);
+}
+
+/// Returns an LLVM pointer to the start of the mbarrier's backing memref.
+static Value getMbarrierPtr(ConversionPatternRewriter &rewriter,
+                            LLVMTypeConverter &typeConverter,
+                            TypedValue<nvgpu::MBarrierType> barrier,
+                            Value barrierMemref) {
+  MemRefType memrefType = createMBarrierMemrefType(rewriter, barrier.getType());
+  MemRefDescriptor memRefDescriptor(barrierMemref);
+  return memRefDescriptor.bufferPtr(rewriter, barrier.getLoc(), typeConverter,
+                                    memrefType);
+}
+
 struct ConvertNVGPUToNVVMPass
     : public impl::ConvertNVGPUToNVVMPassBase<ConvertNVGPUToNVVMPass> {
   using Base::Base;
 
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry
+        .insert<memref::MemRefDialect, LLVM::LLVMDialect, NVVM::NVVMDialect>();
+  }
+
   void runOnOperation() override {
     LowerToLLVMOptions options(&getContext());
     options.useOpaquePointers = useOpaquePointers;
     RewritePatternSet patterns(&getContext());
     LLVMTypeConverter converter(&getContext(), options);
-    /// device-side async tokens cannot be materialized in nvvm. We just convert
-    /// them to a dummy i32 type in order to easily drop them during conversion.
+    IRRewriter rewriter(&getContext());
+    /// device-side async tokens cannot be materialized in nvvm. We just
+    /// convert them to a dummy i32 type in order to easily drop them during
+    /// conversion.
     converter.addConversion([&](nvgpu::DeviceAsyncTokenType type) -> Type {
       return converter.convertType(IntegerType::get(type.getContext(), 32));
     });
+    converter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
+      return converter.convertType(IntegerType::get(type.getContext(), 64));
+    });
+    converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
+      return converter.convertType(createMBarrierMemrefType(rewriter, type));
+    });
     populateNVGPUToNVVMConversionPatterns(converter, patterns);
     LLVMConversionTarget target(getContext());
     target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
+    target.addLegalDialect<::mlir::memref::MemRefDialect>();
     target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
@@ -651,11 +698,155 @@
   }
 };
 
+/// Lowers `nvgpu.mbarrier.create` to a shared-memory `memref.global` (one
+/// 8-byte-aligned i64) plus a `memref.get_global` that yields the barrier.
+struct NVGPUMBarrierCreateLowering
+    : public ConvertOpToLLVMPattern<nvgpu::MBarrierCreateOp> {
+  using ConvertOpToLLVMPattern<nvgpu::MBarrierCreateOp>::ConvertOpToLLVMPattern;
+
+  template <typename moduleT>
+  memref::GlobalOp generateGlobalBarrier(ConversionPatternRewriter &rewriter,
+                                         Operation *funcOp, moduleT moduleOp,
+                                         MemRefType barrierType) const {
+    SymbolTable symbolTable(moduleOp);
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPoint(&moduleOp.front());
+    auto global = rewriter.create<memref::GlobalOp>(
+        funcOp->getLoc(), "__mbarrier",
+        /*sym_visibility=*/rewriter.getStringAttr("private"),
+        /*type=*/barrierType,
+        /*initial_value=*/ElementsAttr(),
+        /*constant=*/false,
+        /*alignment=*/rewriter.getI64IntegerAttr(8));
+    symbolTable.insert(global);
+    return global;
+  }
+
+  LogicalResult
+  matchAndRewrite(nvgpu::MBarrierCreateOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Operation *funcOp = op->getParentOp();
+    Operation *mOp = funcOp->getParentOp();
+    MemRefType barrierType =
+        createMBarrierMemrefType(rewriter, op.getBarrier().getType());
+
+    memref::GlobalOp global;
+    if (auto moduleOp = dyn_cast<gpu::GPUModuleOp>(mOp))
+      global = generateGlobalBarrier(rewriter, funcOp, moduleOp, barrierType);
+    else if (auto moduleOp = dyn_cast<ModuleOp>(mOp))
+      global = generateGlobalBarrier(rewriter, funcOp, moduleOp, barrierType);
+
+    rewriter.setInsertionPoint(op);
+    rewriter.replaceOpWithNewOp<memref::GetGlobalOp>(op, barrierType,
+                                                     global.getName());
+    return success();
+  }
+};
+
+/// Lowers `nvgpu.mbarrier.init` to `nvvm.mbarrier.init[.shared]`.
+struct NVGPUMBarrierInitLowering
+    : public ConvertOpToLLVMPattern<nvgpu::MBarrierInitOp> {
+  using ConvertOpToLLVMPattern<nvgpu::MBarrierInitOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(nvgpu::MBarrierInitOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.setInsertionPoint(op);
+    Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+                                   op.getBarrier(), adaptor.getBarrier());
+
+    // NVVM expects an i32 count; the op's operand is an index.
+    Value count = adaptor.getCount();
+    if (!adaptor.getCount().getType().isInteger(32)) {
+      count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
+                                             rewriter.getI32Type(), count);
+    }
+
+    if (isMbarrierShared(op.getBarrier().getType())) {
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(op, barrier,
+                                                              count);
+    } else {
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count);
+    }
+    return success();
+  }
+};
+
+/// Lowers `nvgpu.mbarrier.arrive` to `nvvm.mbarrier.arrive[.shared]`.
+struct NVGPUMBarrierArriveLowering
+    : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveOp> {
+  using ConvertOpToLLVMPattern<nvgpu::MBarrierArriveOp>::ConvertOpToLLVMPattern;
+  LogicalResult
+  matchAndRewrite(nvgpu::MBarrierArriveOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+                                   op.getBarrier(), adaptor.getBarrier());
+    Type tokenType = getTypeConverter()->convertType(
+        nvgpu::MBarrierTokenType::get(op->getContext()));
+    if (isMbarrierShared(op.getBarrier().getType())) {
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType,
+                                                                barrier);
+    } else {
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType,
+                                                          barrier);
+    }
+    return success();
+  }
+};
+
+/// Lowers `nvgpu.mbarrier.arrive.nocomplete` to
+/// `nvvm.mbarrier.arrive.nocomplete[.shared]`.
+struct NVGPUMBarrierArriveNoCompleteLowering
+    : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveNoCompleteOp> {
+  using ConvertOpToLLVMPattern<
+      nvgpu::MBarrierArriveNoCompleteOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(nvgpu::MBarrierArriveNoCompleteOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+                                   op.getBarrier(), adaptor.getBarrier());
+    Type tokenType = getTypeConverter()->convertType(
+        nvgpu::MBarrierTokenType::get(op->getContext()));
+    // NVVM expects an i32 count; the op's operand is an index.
+    Value count = adaptor.getCount();
+    if (!adaptor.getCount().getType().isInteger(32)) {
+      count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
+                                             rewriter.getI32Type(), count);
+    }
+    if (isMbarrierShared(op.getBarrier().getType())) {
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
+          op, tokenType, barrier, count);
+    } else {
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>(
+          op, tokenType, barrier, count);
+    }
+    return success();
+  }
+};
+
+/// Lowers `nvgpu.mbarrier.test_wait.parity` to
+/// `nvvm.mbarrier.test.wait[.shared]`.
+struct NVGPUMBarrierTestWaitParityLowering
+    : public ConvertOpToLLVMPattern<nvgpu::MBarrierTestWaitParityOp> {
+  using ConvertOpToLLVMPattern<
+      nvgpu::MBarrierTestWaitParityOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(nvgpu::MBarrierTestWaitParityOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+                                   op.getBarrier(), adaptor.getBarrier());
+    Type retType = rewriter.getI1Type();
+    if (isMbarrierShared(op.getBarrier().getType())) {
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>(
+          op, retType, barrier, adaptor.getToken());
+    } else {
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(
+          op, retType, barrier, adaptor.getToken());
+    }
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                                  RewritePatternSet &patterns) {
-  patterns.add<MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
-               NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
-               NVGPUMmaSparseSyncLowering>(converter);
+  patterns.add<MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM,
+               NVGPUMBarrierCreateLowering,           // nvgpu.mbarrier.create
+               NVGPUMBarrierInitLowering,             // nvgpu.mbarrier.init
+               NVGPUMBarrierArriveLowering,           // nvgpu.mbarrier.arrive
+               NVGPUMBarrierArriveNoCompleteLowering, // nvgpu.mbarrier.arrive.nocomplete
+               NVGPUMBarrierTestWaitParityLowering,   // nvgpu.mbarrier.test_wait.parity
+               NVGPUAsyncCopyLowering, NVGPUAsyncCreateGroupLowering,
+               NVGPUAsyncWaitLowering, NVGPUMmaSparseSyncLowering>(converter);
 }
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -34,8 +34,7 @@
       >();
 }
 
-bool nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
-  Attribute memorySpace = type.getMemorySpace();
+bool nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
   if (!memorySpace)
     return false;
   if (auto intAttr = llvm::dyn_cast<IntegerAttr>(memorySpace))
@@ -45,6 +44,11 @@
   return false;
 }
 
+bool nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
+  Attribute memorySpace = type.getMemorySpace();
+  return isSharedMemoryAddressSpace(memorySpace);
+}
+
 //===----------------------------------------------------------------------===//
 // NVGPU_DeviceAsyncCopyOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -503,3 +503,58 @@
     (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
   return %d : vector<2x2xi32>
 }
+
+// -----
+!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!tokenType = !nvgpu.mbarrier.token
+
+// CHECK-LABEL: func @mbarrier
+func.func @mbarrier() {
+  %num_threads = arith.constant 128 : index
+
+  // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier : memref<1xi64, 3>
+  %barrier = nvgpu.mbarrier.create -> !barrierType
+
+  // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+  nvgpu.mbarrier.init %barrier, %num_threads : !barrierType
+
+  // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]]
+  %token = nvgpu.mbarrier.arrive %barrier : !barrierType -> !tokenType
+
+  // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]]
+  %isDone = nvgpu.mbarrier.test_wait.parity %barrier, %token : !barrierType, !tokenType
+
+  func.return
+}
+
+// -----
+!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!tokenType = !nvgpu.mbarrier.token
+
+// CHECK-LABEL: func @mbarrier_nocomplete
+func.func @mbarrier_nocomplete() {
+  %num_threads = arith.constant 128 : index
+  %count = arith.constant 12 : index
+
+  // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier : memref<1xi64, 3>
+  %barrier = nvgpu.mbarrier.create -> !barrierType
+
+  // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+  nvgpu.mbarrier.init %barrier, %num_threads : !barrierType
+
+  // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]]
+  %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !barrierType -> !tokenType
+
+  // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]]
+  %isDone = nvgpu.mbarrier.test_wait.parity %barrier, %token : !barrierType, !tokenType
+
+  func.return
+}