This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
mlir/
-
lib/Dialect/NVGPU/Transforms/
-
Dialect/
-
NVGPU/
-
Transforms/
-
CreateAsyncGroups.cpp
-
test/Dialect/NVGPU/
-
Dialect/
-
NVGPU/
1/1
transform-create-async-groups.mlir

Differential D157286

[mlir][NVGPU] Support N-D masks in transform.nvgpu.create_async_groups
ClosedPublic

Authored by springerm on Aug 7 2023, 7:39 AM.

Download Raw Diff

Details

Reviewers

nicolasvasilache
guraypp
herhut

Commits

rG15ea2306a41a: [mlir][NVGPU] Support N-D masks in transform.nvgpu.create_async_groups

Summary

Support IR that is generated by the vector-to-scf lowering of N-D vector transfers with a mask. (Until now only 1-D and 2-D transfers were supported.) Only transfers that were fully unrolled are supported.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

springerm created this revision. Aug 7 2023, 7:39 AM

Herald added a project: Restricted Project. · View Herald Transcript · Aug 7 2023, 7:39 AM

Herald added subscribers: bviyer, Moerafaat, zero9178 and 23 others. · View Herald Transcript

springerm requested review of this revision. Aug 7 2023, 7:39 AM

Herald added a reviewer: herhut. · View Herald Transcript · Aug 7 2023, 7:39 AM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added a subscriber: stephenneuendorffer. · View Herald Transcript

springerm added inline comments. Aug 7 2023, 7:40 AM

mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
168–171	These values are dead (have no uses). No need to match them.

Harbormaster completed remote builds in B250789: Diff 547785. Aug 7 2023, 10:03 AM

Looks good to me.

Just a quick suggestion – what do we think about adding an end-to-end integration test for this Op? Recently, @nicolasvasilache introduced the test-lower-to-nvvm pass which could be really helpful.

This revision is now accepted and ready to land. Aug 8 2023, 2:47 AM

In D157286#4568853, @guraypp wrote:

Looks good to me.

Just a quick suggestion – what do we think about adding an end-to-end integration test for this Op? Recently, @nicolasvasilache introduced the test-lower-to-nvvm pass which could be really helpful.

Good idea. I'm going to add copy_async to D156371 and also add an integration test to that.

Closed by commit rG15ea2306a41a: [mlir][NVGPU] Support N-D masks in transform.nvgpu.create_async_groups (authored by springerm). · Explain Why · Aug 8 2023, 5:36 AM

This revision was automatically updated to reflect the committed changes.

springerm added a commit: rG15ea2306a41a: [mlir][NVGPU] Support N-D masks in transform.nvgpu.create_async_groups.

Revision Contents

Path

Size

mlir/

lib/

Dialect/

NVGPU/

Transforms/

CreateAsyncGroups.cpp

42 lines

test/

Dialect/

NVGPU/

transform-create-async-groups.mlir

65 lines

Diff 548169

mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp

Show First 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	static FailureOr<TransferMask> getMaskOp(Operation *loadOp) {
assert(transferRead.getMask().getType().getRank() == 1 &&		assert(transferRead.getMask().getType().getRank() == 1 &&
"expected 1-D mask");		"expected 1-D mask");

// Case 1: Mask is the result of a vector.create_mask.		// Case 1: Mask is the result of a vector.create_mask.
if (auto maskOp =		if (auto maskOp =
transferRead.getMask().getDefiningOp<vector::CreateMaskOp>())		transferRead.getMask().getDefiningOp<vector::CreateMaskOp>())
return TransferMask{maskOp, {}};		return TransferMask{maskOp, {}};

// Case 2: Mask is the result of a vector.extract(vector.create_mask). Only		// Case 2: Mask is the result of a vector.extract(vector.create_mask).
// 2D -> 1D extracts are supported at the moment.
if (auto extractOp =		if (auto extractOp =
transferRead.getMask().getDefiningOp<vector::ExtractOp>())		transferRead.getMask().getDefiningOp<vector::ExtractOp>())
if (auto maskOp =		if (auto maskOp =
extractOp.getVector().getDefiningOp<vector::CreateMaskOp>())		extractOp.getVector().getDefiningOp<vector::CreateMaskOp>())
if (extractOp.getPosition().size() == 1 &&
extractOp.getSourceVectorType().getRank() == 2)
return TransferMask{maskOp,		return TransferMask{maskOp,
SmallVector<int64_t>(extractOp.getPosition())};		SmallVector<int64_t>(extractOp.getPosition())};

// All other cases: not supported.		// All other cases: not supported.
return {};		return failure();
}		}

/// Build an SSA value that represents the number of read elements.		/// Build an SSA value that represents the number of read elements.
static Value buildNumReadElements(OpBuilder &b, Location loc,		static Value buildNumReadElements(OpBuilder &b, Location loc,
Operation *readOp) {		Operation *readOp) {
FailureOr<TransferMask> transferMask = getMaskOp(readOp);		FailureOr<TransferMask> transferMask = getMaskOp(readOp);
assert(succeeded(transferMask) && "invalid transfer mask");		assert(succeeded(transferMask) && "invalid transfer mask");

// No mask => no num_read_elements.		// No mask => no num_read_elements.
if (!transferMask->createMaskOp)		if (!transferMask->createMaskOp)
return Value();		return Value();

// No extract: return size of "ones" segment in the mask.		// No extract: return size of "ones" segment in the mask.
if (transferMask->extractPosition.empty()) {		if (transferMask->extractPosition.empty()) {
assert(transferMask->createMaskOp.getNumOperands() == 1 &&		assert(transferMask->createMaskOp.getNumOperands() == 1 &&
"expected single operand");		"expected single operand");
return transferMask->createMaskOp.getOperand(0);		return transferMask->createMaskOp.getOperand(0);
}		}

// vector.extract(vector.create_mask).		// vector.extract(vector.create_mask).
// If extract_pos < num_ones, take number of elements from the least		// If extract_pos < num_ones, take number of elements from the least
// significant dimension.		// significant dimension. (Do this for all dimensions and bit-AND the
assert(transferMask->createMaskOp.getVectorType().getRank() == 2 &&		// conditions.)
"expected 2D mask");		assert(transferMask->createMaskOp.getVectorType().getRank() -
assert(transferMask->extractPosition.size() == 1 &&		transferMask->extractPosition.size() ==
"expected 2D->1D extract");		1 &&
Value cmp = b.create<arith::CmpIOp>(		"expected N-D -> (N-1)-D extract");
loc, arith::CmpIPredicate::slt,		Value cond;
b.create<arith::ConstantIndexOp>(loc,		// Note: There is one more `sz` than `pos`. The loop end with the last `pos`.
transferMask->extractPosition.front()),		for (auto [pos, sz] : llvm::zip(transferMask->extractPosition,
transferMask->createMaskOp->getOperands().front());		transferMask->createMaskOp->getOperands())) {
		Value cmp =
		b.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt,
		b.create<arith::ConstantIndexOp>(loc, pos), sz);
		if (!cond) {
		cond = cmp;
		continue;
		}
		cond = b.create<arith::AndIOp>(loc, cmp, cond);
		}
return b.create<arith::SelectOp>(		return b.create<arith::SelectOp>(
loc, cmp, transferMask->createMaskOp->getOperands().back(),		loc, cond, transferMask->createMaskOp->getOperands().back(),
b.create<arith::ConstantIndexOp>(loc, 0));		b.create<arith::ConstantIndexOp>(loc, 0));
}		}

/// Return "true" if the conversion to async copy is supported by "async copy".		/// Return "true" if the conversion to async copy is supported by "async copy".
static bool resultsInSupportedAsyncCopy(MemRefType memrefType,		static bool resultsInSupportedAsyncCopy(MemRefType memrefType,
VectorType vecType) {		VectorType vecType) {
assert(vecType.getRank() == 1 && "expected 1-D vector");		assert(vecType.getRank() == 1 && "expected 1-D vector");
constexpr int64_t kSupportedCpAsyncAlignmentsInBytes[3] = {4, 8, 16};		constexpr int64_t kSupportedCpAsyncAlignmentsInBytes[3] = {4, 8, 16};
▲ Show 20 Lines • Show All 147 Lines • Show Last 20 Lines

mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir

Show First 20 Lines • Show All 159 Lines • ▼ Show 20 Lines	builtin.module {
// CHECK-SAME: %[[sz0:.]]: index, %[[sz1:.]]: index, %[[a:.*]]: memref<1024x1024xf32>		// CHECK-SAME: %[[sz0:.]]: index, %[[sz1:.]]: index, %[[a:.*]]: memref<1024x1024xf32>
func.func @read_2d_with_mask(%sz0: index, %sz1: index, %a: memref<1024x1024xf32>) {		func.func @read_2d_with_mask(%sz0: index, %sz1: index, %a: memref<1024x1024xf32>) {
// CHECK: %[[c0:.*]] = arith.constant 0 : index		// CHECK: %[[c0:.*]] = arith.constant 0 : index
// CHECK: %[[c1:.*]] = arith.constant 1 : index		// CHECK: %[[c1:.*]] = arith.constant 1 : index
// CHECK: %[[c2:.*]] = arith.constant 2 : index		// CHECK: %[[c2:.*]] = arith.constant 2 : index
%0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>		%0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%c0 = arith.constant 0 : index		%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f32		%cst_0 = arith.constant 0.000000e+00 : f32
// CHECK: %[[mask:.*]] = vector.create_mask
// CHECK: %[[e0:.*]] = vector.extract %[[mask]][0] : vector<3x4xi1>
// CHECK: %[[e1:.*]] = vector.extract %[[mask]][1] : vector<3x4xi1>
// CHECK: %[[e2:.*]] = vector.extract %[[mask]][2] : vector<3x4xi1>
springerm (author, inline comment, marked done): These values are dead (have no uses). No need to match them.

// CHECK: %[[cmpi0:.*]] = arith.cmpi slt, %[[c0]], %[[sz0]]		// CHECK: %[[cmpi0:.*]] = arith.cmpi slt, %[[c0]], %[[sz0]]
// CHECK: %[[s0:.*]] = arith.select %[[cmpi0]], %[[sz1]], %[[c0]]		// CHECK: %[[s0:.*]] = arith.select %[[cmpi0]], %[[sz1]], %[[c0]]
// CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c0]]], {{.*}}, 4, %[[s0]] {bypassL1}		// CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c0]]], {{.*}}, 4, %[[s0]] {bypassL1}

// CHECK: %[[cmpi1:.*]] = arith.cmpi slt, %[[c1]], %[[sz0]]		// CHECK: %[[cmpi1:.*]] = arith.cmpi slt, %[[c1]], %[[sz0]]
// CHECK: %[[s1:.*]] = arith.select %[[cmpi1]], %[[sz1]], %[[c0]]		// CHECK: %[[s1:.*]] = arith.select %[[cmpi1]], %[[sz1]], %[[c0]]
// CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c0]]], {{.*}}, 4, %[[s1]] {bypassL1}		// CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c0]]], {{.*}}, 4, %[[s1]] {bypassL1}
Show All 14 Lines	^bb1(%variant_op: !transform.any_op):
transform.apply_patterns to %top_level_func {		transform.apply_patterns to %top_level_func {
transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true		transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true
} : !transform.any_op		} : !transform.any_op
transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)		transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
%top_level_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op		%top_level_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_cse to %top_level_func_2 : !transform.any_op		transform.apply_cse to %top_level_func_2 : !transform.any_op
}		}
}		}

		// -----

		// 3D vector.transfer_read with a mask.
		builtin.module {
		// CHECK-LABEL: @read_3d_with_mask(
		// CHECK-SAME: %[[sz0:.]]: index, %[[sz1:.]]: index, %[[sz2:.]]: index, %[[a:.]]: memref<1024x1024x1024xf32>
		func.func @read_3d_with_mask(%sz0: index, %sz1: index, %sz2: index, %a: memref<1024x1024x1024xf32>) {
		// CHECK: %[[c0:.*]] = arith.constant 0 : index
		// CHECK: %[[c1:.*]] = arith.constant 1 : index
		// CHECK: %[[c2:.*]] = arith.constant 2 : index
		%0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
		%c0 = arith.constant 0 : index
		%cst_0 = arith.constant 0.000000e+00 : f32

		// CHECK: %[[cmpi0:.*]] = arith.cmpi slt, %[[c0]], %[[sz0]]
		// CHECK: %[[cmpi1:.*]] = arith.cmpi slt, %[[c0]], %[[sz1]]
		// CHECK: %[[cond0:.*]] = arith.andi %[[cmpi1]], %[[cmpi0]]
		// CHECK: %[[s0:.*]] = arith.select %[[cond0]], %[[sz2]], %[[c0]]
		// CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c0]], %[[c0]]], {{.*}}, 4, %[[s0]] {bypassL1}

		// CHECK: %[[cmpi2:.*]] = arith.cmpi slt, %[[c1]], %[[sz1]]
		// CHECK: %[[cond1:.*]] = arith.andi %[[cmpi2]], %[[cmpi0]]
		// CHECK: %[[s1:.*]] = arith.select %[[cond1]], %[[sz2]], %[[c0]]
		// CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c1]], %[[c0]]], {{.*}}, 4, %[[s1]] {bypassL1}

		// CHECK: %[[cmpi3:.*]] = arith.cmpi slt, %[[c2]], %[[sz1]]
		// CHECK: %[[cond2:.*]] = arith.andi %[[cmpi3]], %[[cmpi0]]
		// CHECK: %[[s2:.*]] = arith.select %[[cond2]], %[[sz2]], %[[c0]]
		// CHECK: nvgpu.device_async_copy %[[a]][%[[c0]], %[[c2]], %[[c0]]], {{.*}}, 4, %[[s2]] {bypassL1}

		// CHECK: %[[cmpi4:.*]] = arith.cmpi slt, %[[c1]], %[[sz0]]
		// CHECK: %[[cond3:.*]] = arith.andi %[[cmpi1]], %[[cmpi4]]
		// CHECK: %[[s3:.*]] = arith.select %[[cond3]], %[[sz2]], %[[c0]]
		// CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c0]], %[[c0]]], {{.*}}, 4, %[[s3]] {bypassL1}

		// CHECK: %[[cond4:.*]] = arith.andi %[[cmpi2]], %[[cmpi4]]
		// CHECK: %[[s4:.*]] = arith.select %[[cond4]], %[[sz2]], %[[c0]]
		// CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c1]], %[[c0]]], {{.*}}, 4, %[[s4]] {bypassL1}

		// CHECK: %[[cond5:.*]] = arith.andi %[[cmpi3]], %[[cmpi4]]
		// CHECK: %[[s5:.*]] = arith.select %[[cond5]], %[[sz2]], %[[c0]]
		// CHECK: nvgpu.device_async_copy %[[a]][%[[c1]], %[[c2]], %[[c0]]], {{.*}}, 4, %[[s5]] {bypassL1}
		%mask = vector.create_mask %sz0, %sz1, %sz2 : vector<2x3x4xi1>
		%1 = vector.transfer_read %a[%c0, %c0, %c0], %cst_0, %mask {in_bounds = [true, true, true]} : memref<1024x1024x1024xf32>, vector<2x3x4xf32>
		vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<2x3x4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>

		return
		}

		transform.sequence failures(propagate) {
		^bb1(%variant_op: !transform.any_op):
		%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
		transform.apply_patterns to %top_level_func {
		transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true
		} : !transform.any_op
		transform.nvgpu.create_async_groups %top_level_func {bypass_l1} : (!transform.any_op) -> (!transform.any_op)
		%top_level_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
		transform.apply_cse to %top_level_func_2 : !transform.any_op
		}
		}