Currently, the map_nested_foreach_to_threads op requires the blockDim argument
to be present, which is tedious to provide.
This revision sets blockDim automatically when all the 'scf.foreach_thread'
trip counts are known at compile time. It traverses all sibling
scf.foreach_thread ops, finds the largest trip count in each dimension,
and assigns those per-dimension maxima to blockDim.
For the example shown below, it sets blockDim = [12, 9, 1], i.e. the x, y,
and z dimensions respectively.
gpu.launch() {
  scf.foreach_thread (%i, %j) in (7, 9)  // parallelized to threadIdx.x and threadIdx.y
  scf.foreach_thread (%i) in (12)        // parallelized to threadIdx.x
}
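The per-dimension maximum described above can be sketched as follows. This is
a minimal illustration of the selection logic in Python, not the actual MLIR
implementation; the helper name and its input representation (a list of static
trip-count tuples, one per sibling scf.foreach_thread op) are assumptions.

```python
def compute_block_dim(trip_counts_per_op, num_dims=3):
    """Given the static trip counts of sibling scf.foreach_thread ops,
    take the maximum trip count per dimension and pad the result to the
    three GPU dimensions (x, y, z), defaulting unused dimensions to 1."""
    block_dim = [1] * num_dims
    for trips in trip_counts_per_op:
        for dim, count in enumerate(trips):
            block_dim[dim] = max(block_dim[dim], count)
    return block_dim

# For the example above: (7, 9) and (12,) yield blockDim = [12, 9, 1].
print(compute_block_dim([(7, 9), (12,)]))  # -> [12, 9, 1]
```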
Note: the default blockDim computation needs to happen here, not hidden
under the map function.