This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
mlir/
-
lib/Dialect/GPU/TransformOps/
-
Dialect/
-
GPU/
-
TransformOps/
-
GPUTransformOps.cpp
-
test/Dialect/GPU/
-
Dialect/
-
GPU/
-
transform-gpu-failing.mlir

Differential D135566

[mlir][transform] Fail if there is no `thread_dim_mapping`
Needs ReviewPublic

Authored by guraypp on Oct 10 2022, 2:11 AM.

Download Raw Diff

Details

Reviewers

nicolasvasilache
bondhugula
ThomasRaoux
herhut

Summary

map_nested_foreach_to_threads has implicit loop mapping; for details, see below. While it's correct, it does not make use of warp programming.

for(i)    --> threadIdx.x
 for(j)   --> threadIdx.y
  for(k)  --> threadIdx.z

In certain circumstances, the compiler can automatically determine the optimal loop mapping, but not always. We anticipate explicit mapping from the user until we find a fully performant solution. So this revision adds a failure if there is no thread_dim_mapping.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

guraypp created this revision.Oct 10 2022, 2:11 AM

Herald added a reviewer: bondhugula. · View Herald TranscriptOct 10 2022, 2:11 AM

Herald added a reviewer: ThomasRaoux. · View Herald Transcript

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: zero9178, bzcheeseman, sdasgup3 and 20 others. · View Herald Transcript

guraypp requested review of this revision.Oct 10 2022, 2:11 AM

Herald added a reviewer: herhut. · View Herald TranscriptOct 10 2022, 2:11 AM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added a subscriber: stephenneuendorffer. · View Herald Transcript

Harbormaster completed remote builds in B191238: Diff 466456.Oct 10 2022, 2:26 AM

Revision Contents

Path

Size

mlir/

lib/

Dialect/

GPU/

TransformOps/

GPUTransformOps.cpp

4 lines

test/

Dialect/

GPU/

transform-gpu-failing.mlir

35 lines

Diff 466456

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

Show First 20 Lines • Show All 330 Lines • ▼ Show 20 Lines	static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
if (foreachThreadOp.getNumResults() > 0)		if (foreachThreadOp.getNumResults() > 0)
return failureHelper(		return failureHelper(
"only bufferized scf.foreach_thread lowers to gpu.thread_id");		"only bufferized scf.foreach_thread lowers to gpu.thread_id");

if (foreachThreadOp.getNumThreads().size() > 3)		if (foreachThreadOp.getNumThreads().size() > 3)
return failureHelper(		return failureHelper(
"scf.foreach_thread with rank > 3 does not lower to gpu.thread_id");		"scf.foreach_thread with rank > 3 does not lower to gpu.thread_id");

		if (foreachThreadOp.getThreadDimMapping().empty())
		return failureHelper(
		"scf.foreach_thread is missing thread_dim_mapping attribute.");

auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter);		auto potentialBlockDim = foreachThreadOp.getPermutedNumThreads(rewriter);
if (failed(potentialBlockDim) \|\|		if (failed(potentialBlockDim) \|\|
llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) {		llvm::any_of(*potentialBlockDim, [](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();		return !getConstantIntValue(ofr).has_value();
})) {		})) {
return failureHelper("unsupported dynamic blockdim size");		return failureHelper("unsupported dynamic blockdim size");
}		}

▲ Show 20 Lines • Show All 162 Lines • Show Last 20 Lines

mlir/test/Dialect/GPU/transform-gpu-failing.mlir

	Show First 20 Lines • Show All 282 Lines • ▼ Show 20 Lines
	^bb0(%arg0: !pdl.operation):			^bb0(%arg0: !pdl.operation):
	%funcop = transform.structured.match ops{["func.func"]} in %arg0			%funcop = transform.structured.match ops{["func.func"]} in %arg0
	// expected-error @below {{Trying to launch a GPU kernel with gridDim = (65535, 65535, 1) blockDim = (1, 1, 1). It is larger than the limits.}}			// expected-error @below {{Trying to launch a GPU kernel with gridDim = (65535, 65535, 1) blockDim = (1, 1, 1). It is larger than the limits.}}
	%1 = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }			%1 = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
	}			}

	// -----			// -----


				// -----

				!type = memref<2 x 32 x f32>
				!type1d = memref<32 x f32>

				// CHECK-LABEL: func.func @saxpy2d_no_barrier(
				func.func @no_thread_map(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
				%one = arith.constant 1 : index
				%c12 = arith.constant 12 : index
				%c9 = arith.constant 9 : index
				%c7 = arith.constant 7 : index
				%name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
				threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
				{
				scf.foreach_thread (%i, %j) in (%c7, %c9) {
				%4 = memref.load %x[%i, %j] : !type
				%5 = memref.load %y[%i, %j] : !type
				%6 = math.fma %alpha, %4, %5 : f32
				memref.store %6, %y[%i, %j] : !type
				}
				gpu.terminator
				}
				return %y : !type
				}

				transform.with_pdl_patterns {
				^bb0(%arg0: !pdl.operation):
				transform.sequence %arg0 failures(propagate) {
				^bb1(%arg1: !pdl.operation):
				%funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
				// expected-error @below {{scf.foreach_thread is missing thread_dim_mapping attribute.}}
				transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1] }
				}
				}
				No newline at end of file