This is an archive of the discontinued LLVM Phabricator instance.

[mlir][nvgpu] Add initial support for `mbarrier`
ClosedPublic

Authored by guraypp on Jun 29 2023, 8:14 AM.

Details

Summary

mbarrier is a barrier object created in shared memory that supports synchronization patterns beyond what __syncthreads offers; for more information, see:
https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier

This work adds initial mbarrier-related Ops to the nvgpu dialect.

First, it introduces two types:
mbarrier.barrier, a barrier object that lives in shared memory
mbarrier.barrier.token, a token returned by arrive operations

It introduces the following Ops:
mbarrier.create creates an mbarrier.barrier
mbarrier.init initializes an mbarrier.barrier
mbarrier.arrive performs arrive-on an mbarrier.barrier and returns an mbarrier.barrier.token
mbarrier.arrive.nocomplete performs a non-blocking arrive-on an mbarrier.barrier and returns an mbarrier.barrier.token
mbarrier.test_wait waits on an mbarrier.barrier with an mbarrier.barrier.token
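
Taken together, a typical use of these Ops might look like the sketch below. This is a hypothetical illustration: the exact op spellings, type syntax, and result types are defined by the patch itself and may differ from what is shown here.

```mlir
func.func @use_mbarrier() {
  // Create the barrier object in shared memory, then initialize it
  // for 128 participating threads (syntax is illustrative).
  %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier
  %count = arith.constant 128 : index
  nvgpu.mbarrier.init %barrier, %count : !nvgpu.mbarrier.barrier

  // Arrive on the barrier; the returned token identifies this phase.
  %token = nvgpu.mbarrier.arrive %barrier
      : !nvgpu.mbarrier.barrier -> !nvgpu.mbarrier.barrier.token

  // Wait until the phase associated with the token completes.
  %done = nvgpu.mbarrier.test_wait %barrier, %token
      : !nvgpu.mbarrier.barrier, !nvgpu.mbarrier.barrier.token -> i1
  func.return
}
```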

Diff Detail

Event Timeline

guraypp created this revision.Jun 29 2023, 8:14 AM
Herald added a project: Restricted Project.Jun 29 2023, 8:14 AM
guraypp requested review of this revision.Jun 29 2023, 8:14 AM
guraypp retitled this revision from [mlir][nvvm] Add initial support for mbarrier to [mlir][nvgpu] Add initial support for `mbarrier`.Jun 29 2023, 8:14 AM
guraypp added inline comments.Jul 4 2023, 2:11 AM
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
717

The mbarrier object needs to live in static shared memory. Here, I create a memref::GlobalOp to match what clang generates. See an example:
https://godbolt.org/z/T4Gonnoez

clang emits the static shared-memory value as a global, like below:

@arrive(int*)::bar = internal unnamed_addr addrspace(3) global i64 undef, align 8
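
For reference, CUDA source along the following lines makes clang emit such a global. This is a hypothetical reconstruction of the godbolt example, inferred from the demangled symbol arrive(int*)::bar; the actual source behind the link may differ.

```cuda
#include <cuda/barrier>

__global__ void arrive(int *out) {
  // A function-local __shared__ barrier: clang lowers this to an
  // addrspace(3) global with internal linkage, as shown above.
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  if (threadIdx.x == 0)
    init(&bar, blockDim.x);  // lowers to mbarrier.init on the shared object
  __syncthreads();
  bar.arrive_and_wait();     // arrive-on + wait on the current phase
  out[threadIdx.x] = threadIdx.x;
}
```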
guraypp added inline comments.Jul 4 2023, 2:56 AM
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
717

Initially, I attempted to use memref.alloca with address space 3, but it resulted in incorrect PTX. See the example below:

func.func @do_alloca() {
  %wg = memref.alloca() {alignment = 8} : memref<1xi64, 3>
  %c128 = arith.constant 128 : i32  
  %2 = builtin.unrealized_conversion_cast %wg : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
  %3 = llvm.extractvalue %2[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> 
  nvvm.mbarrier.init.shared %3, %c128 : !llvm.ptr<3>, i32
  func.return
}

This produced the following LLVM IR, which looked correct to me: the alloca has addrspace(3).

define void @do_alloca() {
  %1 = alloca i64, i64 1, align 8, addrspace(3)
  %2 = insertvalue { ptr addrspace(3), ptr addrspace(3), i64, [1 x i64], [1 x i64] } undef, ptr addrspace(3) %1, 0
  %3 = insertvalue { ptr addrspace(3), ptr addrspace(3), i64, [1 x i64], [1 x i64] } %2, ptr addrspace(3) %1, 1
  %4 = insertvalue { ptr addrspace(3), ptr addrspace(3), i64, [1 x i64], [1 x i64] } %3, i64 0, 2
  %5 = insertvalue { ptr addrspace(3), ptr addrspace(3), i64, [1 x i64], [1 x i64] } %4, i64 1, 3, 0
  %6 = insertvalue { ptr addrspace(3), ptr addrspace(3), i64, [1 x i64], [1 x i64] } %5, i64 1, 4, 0
  %7 = extractvalue { ptr addrspace(3), ptr addrspace(3), i64, [1 x i64], [1 x i64] } %6, 1
  call void @llvm.nvvm.mbarrier.init.shared(ptr addrspace(3) %7, i32 128)
  ret void
}

Despite the LLVM IR having the expected alloca with addrspace(3), the PTX showed that the value was allocated on the stack (.local) instead of in shared memory. See the PTX below:

.visible .func do_alloca()
{
        .local .align 8 .b8     __local_depot0[8];
        .reg .b32       %SP;
        .reg .b32       %SPL;
        .reg .b32       %r<3>;
        mov.u32         %SPL, __local_depot0;
        cvta.local.u32  %SP, %SPL;
        mov.u32         %r1, 128;
        add.u32         %r2, %SP, 0;
        mbarrier.init.shared.b64 [%r2], %r1;
        ret;
                                        // -- End function
}

To address this, I examined how clang implements static shared memory and discovered that it generates a global value with internal linkage. Thus, I modified my IR accordingly. As a result, I achieved the desired PTX output, where the barrier object is placed in shared memory as .shared .align 8 .b8 __mbarrier[8];.

@__mbarrier = internal addrspace(3) global [1 x i64] undef, align 8
define void @do_global() {
  call void @llvm.nvvm.mbarrier.init.shared(ptr addrspace(3) @__mbarrier, i32 128)
  ret void
}

PTX:

.visible .func do_global()              // @do_global
{
        .reg .b32       %r<3>;
        .shared .align 8 .b8 __mbarrier[8];
        mov.u32         %r1, __mbarrier;
        mov.u32         %r2, 128;
        mbarrier.init.shared.b64 [%r1], %r2;
        ret;
                                        // -- End function
}
guraypp added inline comments.Jul 4 2023, 3:14 AM
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
717

Speaking of LLVM's internal linkage type, I could not find a way to set internal linkage on a memref::GlobalOp, so I put up a patch that attempts to add it:
https://reviews.llvm.org/D154074

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
87

doc please: this is the type for a barrier in shared memory that is used to synchronize a variable number of threads. Conceptually it behaves similarly to C++ std::barrier<T>::arrive_and_wait
etc

420

Can we document what "parity" means here?

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
706

Can we add a detailed enough comment to justify the current choice of memref::GlobalOp, vs e.g. AllocaOp?

749

comments here and below for lowering please.

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
561

nit: nl

qcolombet added inline comments.Jul 4 2023, 8:21 AM
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
411

Can we use init on the same barrier several times?
E.g.,

barrier = mbarrier.create
mbarrier.init barrier
// RegionA: Do some stuff with barrier
mbarrier.init barrier
// RegionB: Do some other stuff with barrier

Ultimately what I am wondering is what kind of "effects" we should set on this operation, and/or how we should model the dependency on barrier. (like init is a store-like operation and other mbarrier operations are load-like operations on the given barrier?)

For instance, in my snippet, we wouldn't want to move some code from RegionB in RegionA.

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
346

Should we return an optional here?

Put differently, what should the user of this API do when they receive {}?

guraypp updated this revision to Diff 539054.Jul 11 2023, 6:17 AM
guraypp marked 3 inline comments as done.

address comments

guraypp edited the summary of this revision.Jul 11 2023, 6:17 AM
nicolasvasilache accepted this revision.Jul 11 2023, 8:32 AM
nicolasvasilache added inline comments.
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
88

can we drop the first sentence? it seems redundant

This revision is now accepted and ready to land.Jul 11 2023, 8:32 AM
guraypp updated this revision to Diff 539125.Jul 11 2023, 8:34 AM
guraypp edited the summary of this revision.

address the comments

This revision was landed with ongoing or failed builds.Jul 11 2023, 8:35 AM
This revision was automatically updated to reflect the committed changes.