Diff 457706

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

Show First 20 Lines • Show All 145 Lines • ▼ Show 20 Lines	let description = [{
This memory access will be pending to be added to a group.		This memory access will be pending to be added to a group.

This op is meant to be used with `nvgpu.device_async_create_group` and		This op is meant to be used with `nvgpu.device_async_create_group` and
`nvgpu.device_async_wait` to synchronize copies as explained in those ops'		`nvgpu.device_async_wait` to synchronize copies as explained in those ops'
descriptions.		descriptions.
`bypassL1` attribute is a hint to the backend and hardware that		`bypassL1` attribute is a hint to the backend and hardware that
the copy should bypass the L1 cache; this may be dropped by the backend or		the copy should bypass the L1 cache; this may be dropped by the backend or
hardware.		hardware.
		`dstElements` attribute is the total number of elements written to
		destination (shared memory).
		`srcElements` argument is the total number of elements read from
		source (global memory).

		`srcElements` is an optional argument; when present, the op reads only
		`srcElements` elements from the source global memory and zero-fills
		the rest of the elements in the destination shared memory.

In order to do a copy and wait for the result we need the following		In order to do a copy and wait for the result we need the following
combination:		combination:
```		```
// copy 1.		// copy 1.
%cp1 = gpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>		%cp1 = gpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
// copy 2.		// copy 2.
%cp2 = gpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>		%cp2 = gpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
Show All 16 Lines	%0 = gpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
memref<4x5xf32> to memref<2x7x5xf32, 3>		memref<4x5xf32> to memref<2x7x5xf32, 3>
```		```
}];		}];
let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);		let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,		let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
Variadic<Index>:$dstIndices,		Variadic<Index>:$dstIndices,
Arg<AnyMemRef, "", [MemRead]>:$src,		Arg<AnyMemRef, "", [MemRead]>:$src,
Variadic<Index>:$srcIndices,		Variadic<Index>:$srcIndices,
IndexAttr:$numElements,		IndexAttr:$dstElements,
		Optional<Index>:$srcElements,
OptionalAttr<UnitAttr>:$bypassL1);		OptionalAttr<UnitAttr>:$bypassL1);
let assemblyFormat = [{		let assemblyFormat = [{
$src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $numElements		$src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
attr-dict `:` type($src) `to` type($dst)		attr-dict `:` type($src) `to` type($dst)
}];		}];
let hasVerifier = 1;		let hasVerifier = 1;
}		}

def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {		def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
let summary = "device side asynchronous create group operation";		let summary = "device side asynchronous create group operation";
let description = [{		let description = [{
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

Show First 20 Lines • Show All 348 Lines • ▼ Show 20 Lines	void runOnOperation() override {
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();		target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<::mlir::NVVM::NVVMDialect>();		target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
if (failed(applyPartialConversion(getOperation(), target,		if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns))))		std::move(patterns))))
signalPassFailure();		signalPassFailure();
}		}
};		};

		/// Emits an inline-assembly `cp.async.cg.shared.global` instruction that
		/// carries both a destination size and a source size operand.
		///
		/// The asm operands are, in order: `dstPtr`, `srcPtr`, `dstBytes`, and a
		/// source byte count computed here as
		/// `(element bit width * srcElements) >> 3`. The caller uses this form when
		/// the optional `srcElements` operand is present, so that only `srcElements`
		/// elements are read from the source and the remaining destination elements
		/// are filled with zeros.
		///
		/// \param loc          Location attached to every created operation.
		/// \param dstPtr       Destination pointer (asm constraint "r").
		/// \param srcPtr       Source pointer (asm constraint "l").
		/// \param dstBytes     Destination size in bytes. It is bound to the "n"
		///                     constraint, so it is expected to be produced by a
		///                     constant op.
		/// \param srcElements  Number of source elements to read; truncated to i32
		///                     before use.
		/// \param elementType  Memref type whose element bit width converts
		///                     `srcElements` into a byte count.
		/// \param rewriter     Rewriter used to create the new operations.
		static void emitCpAsyncOpZfillAsm(Location loc, Value dstPtr, Value srcPtr,
		                                  Value dstBytes, Value srcElements,
		                                  mlir::MemRefType elementType,
		                                  ConversionPatternRewriter &rewriter) {
		  auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
		                                                  LLVM::AsmDialect::AD_ATT);
		  const char *asmStr = "cp.async.cg.shared.global [$0], [$1], $2, $3;\n";
		  const char *asmConstraints = "r,l,n,r";

		  // srcBytes = (bitwidth * srcElements) >> 3, i.e. bits converted to bytes.
		  Value c3I32 = rewriter.create<LLVM::ConstantOp>(
		      loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(3));
		  Value bitwidth = rewriter.create<LLVM::ConstantOp>(
		      loc, rewriter.getI32Type(),
		      rewriter.getI32IntegerAttr(elementType.getElementTypeBitWidth()));
		  Value srcElementsI32 =
		      rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), srcElements);
		  Value srcBytes = rewriter.create<LLVM::LShrOp>(
		      loc, rewriter.create<LLVM::MulOp>(loc, bitwidth, srcElementsI32), c3I32);

		  SmallVector<Value> asmVals{dstPtr, srcPtr, dstBytes, srcBytes};

		  rewriter.create<LLVM::InlineAsmOp>(
		      loc, LLVM::LLVMVoidType::get(rewriter.getContext()), /*operands=*/asmVals,
		      /*asm_string=*/asmStr,
		      /*constraints=*/asmConstraints, /*has_side_effects=*/true,
		      /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
		      /*operand_attrs=*/ArrayAttr());
		}

struct NVGPUAsyncCopyLowering		struct NVGPUAsyncCopyLowering
: public ConvertOpToLLVMPattern<nvgpu::DeviceAsyncCopyOp> {		: public ConvertOpToLLVMPattern<nvgpu::DeviceAsyncCopyOp> {
using ConvertOpToLLVMPattern<		using ConvertOpToLLVMPattern<
nvgpu::DeviceAsyncCopyOp>::ConvertOpToLLVMPattern;		nvgpu::DeviceAsyncCopyOp>::ConvertOpToLLVMPattern;

LogicalResult		LogicalResult
matchAndRewrite(nvgpu::DeviceAsyncCopyOp op, OpAdaptor adaptor,		matchAndRewrite(nvgpu::DeviceAsyncCopyOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {		ConversionPatternRewriter &rewriter) const override {
Location loc = op->getLoc();		Location loc = op->getLoc();
		ThomasRaouxUnsubmitted Done Reply Inline Actions nit: remove empty line ThomasRaoux: nit: remove empty line
auto dstMemrefType = op.getDst().getType().cast<MemRefType>();		auto dstMemrefType = op.getDst().getType().cast<MemRefType>();
Value dstPtr = getStridedElementPtr(loc, dstMemrefType, adaptor.getDst(),		Value dstPtr = getStridedElementPtr(loc, dstMemrefType, adaptor.getDst(),
adaptor.getDstIndices(), rewriter);		adaptor.getDstIndices(), rewriter);
auto i8Ty = IntegerType::get(op.getContext(), 8);		auto i8Ty = IntegerType::get(op.getContext(), 8);
auto dstPointerType =		auto dstPointerType =
LLVM::LLVMPointerType::get(i8Ty, dstMemrefType.getMemorySpaceAsInt());		LLVM::LLVMPointerType::get(i8Ty, dstMemrefType.getMemorySpaceAsInt());
dstPtr = rewriter.create<LLVM::BitcastOp>(loc, dstPointerType, dstPtr);		dstPtr = rewriter.create<LLVM::BitcastOp>(loc, dstPointerType, dstPtr);

auto srcMemrefType = op.getSrc().getType().cast<MemRefType>();		auto srcMemrefType = op.getSrc().getType().cast<MemRefType>();

Value scrPtr = getStridedElementPtr(loc, srcMemrefType, adaptor.getSrc(),		Value scrPtr = getStridedElementPtr(loc, srcMemrefType, adaptor.getSrc(),
adaptor.getSrcIndices(), rewriter);		adaptor.getSrcIndices(), rewriter);
auto srcPointerType =		auto srcPointerType =
LLVM::LLVMPointerType::get(i8Ty, srcMemrefType.getMemorySpaceAsInt());		LLVM::LLVMPointerType::get(i8Ty, srcMemrefType.getMemorySpaceAsInt());
scrPtr = rewriter.create<LLVM::BitcastOp>(loc, srcPointerType, scrPtr);		scrPtr = rewriter.create<LLVM::BitcastOp>(loc, srcPointerType, scrPtr);
// Intrinsics takes a global pointer so we need an address space cast.		// Intrinsics takes a global pointer so we need an address space cast.
auto srcPointerGlobalType = LLVM::LLVMPointerType::get(		auto srcPointerGlobalType = LLVM::LLVMPointerType::get(
i8Ty, NVVM::NVVMMemorySpace::kGlobalMemorySpace);		i8Ty, NVVM::NVVMMemorySpace::kGlobalMemorySpace);
scrPtr = rewriter.create<LLVM::AddrSpaceCastOp>(loc, srcPointerGlobalType,		scrPtr = rewriter.create<LLVM::AddrSpaceCastOp>(loc, srcPointerGlobalType,
scrPtr);		scrPtr);
int64_t numElements = adaptor.getNumElements().getZExtValue();		int64_t dstElements = adaptor.getDstElements().getZExtValue();
int64_t sizeInBytes =		int64_t sizeInBytes =
(dstMemrefType.getElementTypeBitWidth() * numElements) / 8;		(dstMemrefType.getElementTypeBitWidth() * dstElements) / 8;
// bypass L1 is only supported for byte sizes of 16, we drop the hint		// bypass L1 is only supported for byte sizes of 16, we drop the hint
// otherwise.		// otherwise.
UnitAttr bypassL1 =		UnitAttr bypassL1 =
sizeInBytes == 16 ? adaptor.getBypassL1Attr() : UnitAttr();		sizeInBytes == 16 ? adaptor.getBypassL1Attr() : UnitAttr();
rewriter.create<NVVM::CpAsyncOp>(
loc, dstPtr, scrPtr, rewriter.getI32IntegerAttr(sizeInBytes), bypassL1);		// When the optional SrcElements argument is present, the source (global
		ThomasRaouxUnsubmitted Done Reply Inline Actions I believe this naming is cutlass specific and I don't really see it anywhere in ptx or nvvm spec. I think it would be better to explain the difference between those two different versions of cp.async and not name it cp_async_zfill ThomasRaoux: I believe this naming is cutlass specific and I don't really see it anywhere in ptx or nvvm…
		// memory) of CpAsyncOp is read only for SrcElements number of elements. The
		// rest of the DstElements in the destination (shared memory) are filled
		// with zeros.
		if (op.getSrcElements())
		emitCpAsyncOpZfillAsm(loc, dstPtr, scrPtr,
		rewriter.create<LLVM::ConstantOp>(
		loc, rewriter.getI32Type(),
		rewriter.getI32IntegerAttr(sizeInBytes)),
		adaptor.getSrcElements(), srcMemrefType, rewriter);

		// When the optional SrcElements argument is not present, the regular
		// CpAsyncOp is generated. CopyAsyncOp reads bytes from source (global
		// memory) to fill DstElements number of elements in the destination (shared
		// memory).
		else
		rewriter.create<NVVM::CpAsyncOp>(loc, dstPtr, scrPtr,
		rewriter.getI32IntegerAttr(sizeInBytes),
		bypassL1);

// Drop the result token.		// Drop the result token.
Value zero = rewriter.create<LLVM::ConstantOp>(		Value zero = rewriter.create<LLVM::ConstantOp>(
op->getLoc(), IntegerType::get(op.getContext(), 32),		op->getLoc(), IntegerType::get(op.getContext(), 32),
rewriter.getI32IntegerAttr(0));		rewriter.getI32IntegerAttr(0));
rewriter.replaceOp(op, zero);		rewriter.replaceOp(op, zero);
return success();		return success();
}		}
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

Show First 20 Lines • Show All 291 Lines • ▼ Show 20 Lines	func.func @async_cp_i4(
// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI3]]] : (!llvm.ptr<i4>, i64) -> !llvm.ptr<i4>		// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI3]]] : (!llvm.ptr<i4>, i64) -> !llvm.ptr<i4>
// CHECK-DAG: %[[CAST1:.*]] = llvm.bitcast %[[ADDRESSSRC]] : !llvm.ptr<i4> to !llvm.ptr<i8>		// CHECK-DAG: %[[CAST1:.*]] = llvm.bitcast %[[ADDRESSSRC]] : !llvm.ptr<i4> to !llvm.ptr<i8>
// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[CAST1]] : !llvm.ptr<i8> to !llvm.ptr<i8, 1>		// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[CAST1]] : !llvm.ptr<i8> to !llvm.ptr<i8, 1>
// CHECK-DAG: nvvm.cp.async.shared.global %[[CAST0]], %[[CAST2]], 16		// CHECK-DAG: nvvm.cp.async.shared.global %[[CAST0]], %[[CAST2]], 16
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 32 : memref<128x64xi4> to memref<128x128xi4, 3>		%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 32 : memref<128x64xi4> to memref<128x128xi4, 3>
return %0 : !nvgpu.device.async.token		return %0 : !nvgpu.device.async.token
}		}

		// -----

		// Verifies that `nvgpu.device_async_copy` with the optional `srcElements`
		// operand lowers to the inline-asm form of cp.async that takes both a
		// destination and a source byte count.
		// CHECK-LABEL: @async_cp_zfill(
		// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index, %[[SRCELEMENTS:[a-zA-Z0-9_]+]]: index)
		func.func @async_cp_zfill(
		  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {

		  // CHECK-DAG: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %[[DSTPTR:.*]], %[[SRCPTR:.*]], %[[DSTBYTES:.*]], %[[SRCBYTES:.*]] : (!llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>, i32, i32) -> !llvm.void
		  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>
		  // CHECK: nvvm.cp.async.commit.group
		  %1 = nvgpu.device_async_create_group %0
		  // CHECK: nvvm.cp.async.wait.group 1
		  nvgpu.device_async_wait %1 { numGroups = 1 : i32 }

		  return
		}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][NVGPU] Adding Support for cp_async_zfill via Inline Asm
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 457706

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][NVGPU] Adding Support for cp_async_zfill via Inline Asm (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 457706

mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

[mlir][NVGPU] Adding Support for cp_async_zfill via Inline Asm
ClosedPublic