Diff 505447

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

	Show All 27 Lines
	#define GET_OP_CLASSES			#define GET_OP_CLASSES
	#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"			#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"

	namespace mlir {			namespace mlir {
	class DialectRegistry;			class DialectRegistry;
	namespace transform {			namespace transform {
	namespace gpu {			namespace gpu {

				constexpr int64_t kWarpSize = 32;
				bondhugula (inline comment): Doc comment.

				/// Helper type for functions that generate ids for the mapping of a
				/// scf.forall.
				using GpuIdBuilderFnType = llvm::function_ref<SmallVector<Value>(
				RewriterBase &, scf::ForallOp, ArrayRef<int64_t> mappingDims)>;

				/// Helper struct for passing the mapping attributes and id generator to the
				/// common forall rewriter.
				struct GpuIdBuilder {
				/// The mapping attributes targeted by this generator.
				SmallVector<DeviceMappingAttrInterface> mappingAttributes;
				/// The constructor that builds the concrete IR for mapping ids.
				GpuIdBuilderFnType idBuilder;
				};

	/// Map the top level `scf.forall` op to GPU Thread Blocks.			/// Map the top level `scf.forall` op to GPU Thread Blocks.
				ThomasRaoux (inline comment): nit: replace `A the` -> `The`?
	/// Mapping is one-to-one and the induction variables of `scf.forall` are			/// Mapping is one-to-one and the induction variables of `scf.forall` are
	/// rewritten to gpu.block_id according to the thread_dim_apping attribute.			/// rewritten to gpu.block_id according to the thread_dim_mapping attribute.
	/// Dynamic, `scf.forall` trip counts are currently not supported.			/// Dynamic, `scf.forall` trip counts are currently not supported.
	/// Dynamic block dim sizes are currently not supported.			/// Dynamic block dim sizes are currently not supported.
	DiagnosedSilenceableFailure mapForallToBlocksImpl(			DiagnosedSilenceableFailure
	RewriterBase &rewriter, TransformOpInterface transformOp,			mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,
	scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,			scf::ForallOp forallOp,
	const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes,			SmallVectorImpl<int64_t> &gridDims,
	function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>			const GpuIdBuilder &gpuIdBuilder);
	blockIdGenerator);

	/// Search `scf.forall` ops nested under `target` and map each such op to GPU			/// Search `scf.forall` ops nested under `target` and map each such op to GPU
	/// threads. Mapping is one-to-one and the induction variables of `scf.forall`			/// threads. Mapping is one-to-one and the induction variables of `scf.forall`
	/// are rewritten to gpu.thread_id according to the thread_dim_mapping			/// are rewritten to gpu.thread_id according to the thread_dim_mapping
	/// attribute.			/// attribute.
	/// Sibling `scf.forall` are supported in which case, the union of the number of			/// Sibling `scf.forall` are supported in which case, the union of the number of
	/// threads is computed and may result in predication.			/// threads is computed and may result in predication.
	/// Dynamic, `scf.forall` trip counts are currently not supported.			/// Dynamic, `scf.forall` trip counts are currently not supported.
	/// Dynamic block dim sizes are currently not supported.			/// Dynamic block dim sizes are currently not supported.
	DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(			DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
	RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,			RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
	Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,			Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
	bool syncAfterDistribute,			bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder);
	const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes,
	function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
	threadIdGenerator);

	/// Find the unique top level scf::ForallOp within a given target op.			/// Find the unique top level scf::ForallOp within a given target op.
	DiagnosedSilenceableFailure			DiagnosedSilenceableFailure
	findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,			findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
	TransformOpInterface transformOp);			TransformOpInterface transformOp);

	} // namespace gpu			} // namespace gpu
	} // namespace transform			} // namespace transform

	namespace gpu {			namespace gpu {
	void registerTransformDialectExtension(DialectRegistry &registry);			void registerTransformDialectExtension(DialectRegistry &registry);
	} // namespace gpu			} // namespace gpu
	} // namespace mlir			} // namespace mlir

	#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H			#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//		//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"		#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"

		#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"		#include "mlir/Dialect/Arith/IR/Arith.h"
		#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"		#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"		#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/PDL/IR/PDL.h"		#include "mlir/Dialect/PDL/IR/PDL.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"		#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"		#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"		#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"		#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
		#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/IR/BuiltinAttributes.h"		#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"		#include "mlir/IR/IRMapping.h"
		#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"		#include "mlir/IR/OpDefinition.h"
#include "mlir/Support/LLVM.h"		#include "mlir/Support/LLVM.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"		#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"

using namespace mlir;		using namespace mlir;
using namespace mlir::gpu;		using namespace mlir::gpu;
using namespace mlir::transform;		using namespace mlir::transform;
		using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"		#define DEBUG_TYPE "gpu-transforms"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")		#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")		#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

namespace {		namespace {

/// Helper type for functions that generate ids for the mapping of a scf.forall.		/// Return a flatten thread id for the workgroup with given sizes.
using IdGeneratorFnType = llvm::function_ref<void(RewriterBase &, scf::ForallOp,		static OpFoldResult getLinearThreadId(RewriterBase &rewriter, Location loc) {
SmallVectorImpl<Value> &)>;		AffineExpr tx, ty, tz, BDX, BDY;
		bindDims(rewriter.getContext(), tx, ty, tz);
struct MappingToGpuHelper {		bindSymbols(rewriter.getContext(), BDX, BDY);
MappingToGpuHelper(SmallVector<DeviceMappingAttrInterface> mappingAttributes,		IndexType indexType = rewriter.getIndexType();
IdGeneratorFnType idGenerator)		SmallVector<OpFoldResult> threadsAndWorkGroups{
: mappingAttributes(mappingAttributes), idGenerator(idGenerator) {}		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
SmallVector<DeviceMappingAttrInterface> mappingAttributes;		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
IdGeneratorFnType idGenerator;		threadsAndWorkGroups.push_back(
};		rewriter.create<BlockDimOp>(loc, indexType, Dimension::x).getResult());
		threadsAndWorkGroups.push_back(
		rewriter.create<BlockDimOp>(loc, indexType, Dimension::y).getResult());
		return makeComposedFoldedAffineApply(
		rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
		}

struct MappingToGpuBlocksHelper : public MappingToGpuHelper {		struct GpuBlockIdBuilder : public GpuIdBuilder {

MappingToGpuBlocksHelper(MLIRContext *ctx)		GpuBlockIdBuilder(MLIRContext *ctx) : GpuIdBuilder() {
: MappingToGpuHelper(		mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
SmallVector<DeviceMappingAttrInterface>{
GPUBlockMappingAttr::get(ctx, Blocks::DimX),
GPUBlockMappingAttr::get(ctx, Blocks::DimY),		GPUBlockMappingAttr::get(ctx, Blocks::DimY),
GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},		GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,		idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
SmallVectorImpl<Value> &ids) {		ArrayRef<int64_t> mappingDims) {
OpBuilder::InsertionGuard guard(rewriter);		OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(forallOp);		rewriter.setInsertionPoint(forallOp);
IndexType indexType = rewriter.getIndexType();		IndexType indexType = rewriter.getIndexType();
auto loc = forallOp->getLoc();		auto loc = forallOp->getLoc();
ids.assign(		return SmallVector<Value>{
{rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),		rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),		rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)});		rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
}}) {}		};
		}
};		};

struct MappingToGpuThreadsHelper : public MappingToGpuHelper {		struct GpuThreadIdBuilder : public GpuIdBuilder {
MappingToGpuThreadsHelper(MLIRContext *ctx)		GpuThreadIdBuilder(MLIRContext *ctx, SmallVector<Value> *idCaptures = nullptr)
: MappingToGpuHelper(		: GpuIdBuilder() {
SmallVector<DeviceMappingAttrInterface>{		mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
GPUThreadMappingAttr::get(ctx, Threads::DimX),
GPUThreadMappingAttr::get(ctx, Threads::DimY),		GPUThreadMappingAttr::get(ctx, Threads::DimY),
GPUThreadMappingAttr::get(ctx, Threads::DimZ)},		GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,		idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
SmallVectorImpl<Value> &ids) {		ArrayRef<int64_t> mappingDims) {
OpBuilder::InsertionGuard guard(rewriter);		OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(forallOp);		rewriter.setInsertionPoint(forallOp);
IndexType indexType = rewriter.getIndexType();		IndexType indexType = rewriter.getIndexType();
auto loc = forallOp->getLoc();		auto loc = forallOp->getLoc();
ids.assign(		return SmallVector<Value>{
{rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)});		rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
}}) {}		};
		}
		};

		struct GpuWarpIdBuilder : public GpuIdBuilder {
		GpuWarpIdBuilder(MLIRContext *ctx, SmallVector<Value> *idCaptures = nullptr)
		: GpuIdBuilder() {
		mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
		GPUWarpMappingAttr::get(ctx, Warps::DimY),
		GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
		idBuilder = [](RewriterBase &rewriter, scf::ForallOp forallOp,
		ArrayRef<int64_t> mappingDims) {
		OpBuilder::InsertionGuard guard(rewriter);
		rewriter.setInsertionPoint(forallOp);
		Location loc = forallOp.getLoc();
		Value warpId = rewriter.create<SubgroupIdOp>(loc);
		SmallVector<int64_t> reverseBlockDims(llvm::reverse(mappingDims));
		SmallVector<int64_t> strides = computeStrides(reverseBlockDims);
		AffineExpr d0;
		bindDims(rewriter.getContext(), d0);
		SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
		SmallVector<Value> ids;
		for (AffineExpr e : delinearizingExprs)
		ids.push_back(makeComposedAffineApply(rewriter, loc, e, warpId));
		return ids;
		};
		}
		bondhugula (inline comment): Missing code comments for these.
};		};

} // namespace		} // namespace

static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
failureHelper(std::optional<TransformOpInterface> transformOp,		failureHelper(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp, const Twine &message) {		scf::ForallOp forallOp, const Twine &message) {
if (transformOp.has_value())		if (transformOp.has_value())
return emitDefiniteFailure(*transformOp, message);		return transformOp->emitSilenceableError() << message;
return emitDefiniteFailure(forallOp, message);		return emitDefiniteFailure(forallOp, message);
}		}

/// Check if given mapping attributes are one of the desired attributes		/// Check if given mapping attributes are one of the desired attributes
static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,		checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp) {		scf::ForallOp forallOp) {
if (!forallOp.getMapping().has_value())		if (!forallOp.getMapping().has_value())
return failureHelper(transformOp, forallOp, "mapping must be present");		return failureHelper(transformOp, forallOp, "mapping must be present");

bool hasBlockMapping =		bool hasBlockMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
return attr.isa<GPUBlockMappingAttr>();		return attr.isa<GPUBlockMappingAttr>();
});		});
bool hasThreadMapping =		bool hasThreadMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
return attr.isa<GPUThreadMappingAttr>();		return attr.isa<GPUThreadMappingAttr>();
});		});
		bool hasWarpMapping =
		llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
		return attr.isa<GPUWarpMappingAttr>();
		});
int64_t countMappingTypes = 0;		int64_t countMappingTypes = 0;
countMappingTypes += hasBlockMapping ? 1 : 0;		countMappingTypes += hasBlockMapping ? 1 : 0;
countMappingTypes += hasThreadMapping ? 1 : 0;		countMappingTypes += hasThreadMapping ? 1 : 0;
		countMappingTypes += hasWarpMapping ? 1 : 0;
if (countMappingTypes > 1) {		if (countMappingTypes > 1) {
return failureHelper(transformOp, forallOp,		return failureHelper(transformOp, forallOp,
"cannot mix different mapping types, use nesting");		"cannot mix different mapping types, use nesting");
}		}

DenseSet<Attribute> seen;		DenseSet<Attribute> seen;
for (Attribute map : forallOp.getMapping()->getValue()) {		for (Attribute map : forallOp.getMapping()->getValue()) {
if (llvm::is_contained(seen, map)) {		if (llvm::is_contained(seen, map)) {
Show All 30 Lines	if (llvm::any_of(forallOp.getMixedUpperBound(), [&](OpFoldResult ofr) {
return !getConstantIntValue(ofr).has_value();		return !getConstantIntValue(ofr).has_value();
})) {		})) {
return failureHelper(transformOp, forallOp,		return failureHelper(transformOp, forallOp,
"unsupported dynamic sizes in forall op");		"unsupported dynamic sizes in forall op");
}		}
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

/// Determines if the size of the kernel configuration is supported by the GPU		/// Determines if the size of the kernel configuration is supported by the
/// architecture being used. It presently makes use of CUDA limitations, however		/// GPU architecture being used. It presently makes use of CUDA limitations,
/// that aspect may be enhanced for other GPUs.		/// however that aspect may be enhanced for other GPUs.
static DiagnosedSilenceableFailure checkGpuLimits(		static DiagnosedSilenceableFailure checkGpuLimits(
TransformOpInterface transformOp, std::optional<int64_t> gridDimX,		TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,		std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,		std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
std::optional<int64_t> blockDimZ) {		std::optional<int64_t> blockDimZ) {

static constexpr int maxTotalBlockdim = 1024;		static constexpr int maxTotalBlockdim = 1024;
static constexpr int maxBlockdimx = 1024;		static constexpr int maxBlockdimx = 1024;
Show All 19 Lines	return transformOp.emitSilenceableError()
<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "		<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
<< gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1)		<< gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1)
<< ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1)		<< ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1)
<< "). It is larger than the limits.";		<< "). It is larger than the limits.";
}		}
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and		/// Creates an empty-body gpu::LaunchOp using the provided kernel settings
/// put a terminator within.		/// and put a terminator within.
static DiagnosedSilenceableFailure		static DiagnosedSilenceableFailure
createGpuLaunch(RewriterBase &rewriter, Location loc,		createGpuLaunch(RewriterBase &rewriter, Location loc,
TransformOpInterface transformOp, LaunchOp &launchOp,		TransformOpInterface transformOp, LaunchOp &launchOp,
std::optional<int64_t> gridDimX = std::nullopt,		std::optional<int64_t> gridDimX = std::nullopt,
std::optional<int64_t> gridDimY = std::nullopt,		std::optional<int64_t> gridDimY = std::nullopt,
std::optional<int64_t> gridDimZ = std::nullopt,		std::optional<int64_t> gridDimZ = std::nullopt,
std::optional<int64_t> blockDimX = std::nullopt,		std::optional<int64_t> blockDimX = std::nullopt,
std::optional<int64_t> blockDimY = std::nullopt,		std::optional<int64_t> blockDimY = std::nullopt,
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	if (blockDimY.has_value())
gpuLaunch.getBlockSizeYMutable().assign(		gpuLaunch.getBlockSizeYMutable().assign(
createConstValue(blockDimY.value()));		createConstValue(blockDimY.value()));
if (blockDimZ.has_value())		if (blockDimZ.has_value())
gpuLaunch.getBlockSizeZMutable().assign(		gpuLaunch.getBlockSizeZMutable().assign(
createConstValue(blockDimZ.value()));		createConstValue(blockDimZ.value()));
return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

		/// Struct to return the result of the rewrite of a forall operation.
		struct ForallRewriteResult {
		SmallVector<int64_t> mappingSizes;
		SmallVector<Value> mappingIds;
		};

		/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
		static void
		replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
		ValueRange mappingIds,
		ArrayRef<int64_t> availableMappingSizes) {
		assert(!mappingIds.empty() && "expected some mapping");
		OpBuilder::InsertionGuard g(rewriter);
		rewriter.setInsertionPoint(mappingIds.front().getDefiningOp());
		Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
		for (auto [dim, id] : llvm::zip_equal(availableMappingSizes, mappingIds)) {
		if (dim == 1)
		rewriter.replaceAllUsesWith(id, zero);
		}
		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// MapForallToBlocks		// MapForallToBlocks
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

static FailureOr<SmallVector<int64_t>> rewriteOneForallCommonImpl(		static FailureOr<ForallRewriteResult> rewriteOneForallCommonImpl(
RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,		RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp,		scf::ForallOp forallOp,
const SmallVectorImpl<int64_t> &availableMappingSizes,		const SmallVectorImpl<int64_t> &availableMappingSizes,
const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,		const GpuIdBuilder &gpuIdBuilder) {
IdGeneratorFnType idGenerator) {
LDBG("Start rewriteOneForallCommonImpl");		LDBG("Start rewriteOneForallCommonImpl");

// Step 0. GPU-specific verifications. There is no better place to anchor		// Step 0. GPU-specific verifications. There is no better place to anchor
// those right now: the ForallOp is target-independent and the transform op		// those right now: the ForallOp is target-independent and the transform
// does not apply to individual ForallOp.		// op does not apply to individual ForallOp.
DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);		DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);
if (!diag.succeeded())		if (!diag.succeeded())
return failure();		return failure();

// Step 1. Complete the mapping to a full mapping (with 1s) if necessary.		// Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
SmallVector<int64_t> tmpMappingSizes = llvm::to_vector(		SmallVector<int64_t> tmpMappingSizes = llvm::to_vector(
llvm::map_range(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) {		llvm::map_range(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) {
auto maybeStaticValue = getConstantIntValue(ofr);		auto maybeStaticValue = getConstantIntValue(ofr);
assert(maybeStaticValue && "expected static value");		assert(maybeStaticValue && "expected static value");
return maybeStaticValue.value();		return maybeStaticValue.value();
}));		}));
SmallVector<Attribute> forallMappings =		SmallVector<Attribute> forallMappings =
llvm::to_vector(forallOp.getMapping()->getValue());		llvm::to_vector(forallOp.getMapping()->getValue());
for (auto attr : allMappingAttributes) {		for (auto attr : gpuIdBuilder.mappingAttributes) {
if (llvm::is_contained(forallMappings, attr))		if (llvm::is_contained(forallMappings, attr))
continue;		continue;
forallMappings.push_back(attr);		forallMappings.push_back(attr);
tmpMappingSizes.push_back(1);		tmpMappingSizes.push_back(1);
}		}

// Step 2. sort the values by the corresponding DeviceMappingAttrInterface.		// Step 2. sort the values by the corresponding
		// DeviceMappingAttrInterface.
auto comparator = [&](DeviceMappingAttrInterface a,		auto comparator = [&](DeviceMappingAttrInterface a,
DeviceMappingAttrInterface b) -> bool {		DeviceMappingAttrInterface b) -> bool {
return a.getMappingId() < b.getMappingId();		return a.getMappingId() < b.getMappingId();
};		};
SmallVector<int64_t> mappingSizes =		SmallVector<int64_t> mappingSizes =
getValuesSortedByKey(forallMappings, tmpMappingSizes, comparator);		getValuesSortedByKey(forallMappings, tmpMappingSizes, comparator);
LLVM_DEBUG(llvm::interleaveComma(mappingSizes, DBGS() << "mappingSizes: ");		LLVM_DEBUG(llvm::interleaveComma(mappingSizes, DBGS() << "mappingSizes: ");
llvm::dbgs() << "\n";		llvm::dbgs() << "\n";
llvm::interleaveComma(forallMappings, DBGS() << "mappingAttrs: ");		llvm::interleaveComma(forallMappings, DBGS() << "mappingAttrs: ");
llvm::dbgs() << "\n");		llvm::dbgs() << "\n");

// Step 3. Generate the mappingIdOps using the provided generator and map the		// Step 3. Generate the mappingIdOps using the provided generator and map
// induction variables to the newly created ops. Replace ids of dimension		// the induction variables to the newly created ops.
// known to be of size 1 by zero to simplify the IR.		SmallVector<Value> mappingIdOps =
SmallVector<Value> mappingIdOps;		gpuIdBuilder.idBuilder(rewriter, forallOp, mappingSizes);
Location loc = forallOp.getLoc();
idGenerator(rewriter, forallOp, mappingIdOps);
LLVM_DEBUG(llvm::interleaveComma(mappingIdOps, DBGS() << "mappingIdOps: ");		LLVM_DEBUG(llvm::interleaveComma(mappingIdOps, DBGS() << "mappingIdOps: ");
llvm::dbgs() << "\n");		llvm::dbgs() << "\n");
assert(mappingIdOps.size() == mappingSizes.size() && "expect equal sizes");		assert(mappingIdOps.size() == mappingSizes.size() && "expect equal sizes");
Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
if (!availableMappingSizes.empty()) {
for (size_t i : llvm::seq(size_t(0), availableMappingSizes.size())) {
if (availableMappingSizes[i] == 1)
mappingIdOps[i] = zero;
}
}

IRMapping bvm;		IRMapping bvm;
for (auto [iv, dim] :		for (auto [iv, dim] :
llvm::zip_equal(forallOp.getInductionVars(),		llvm::zip_equal(forallOp.getInductionVars(),
ArrayRef<Attribute>{forallMappings}.take_front(		ArrayRef<Attribute>{forallMappings}.take_front(
forallOp.getInductionVars().size()))) {		forallOp.getInductionVars().size()))) {
Value peIdOp = mappingIdOps[static_cast<int64_t>(		Value peIdOp = mappingIdOps[static_cast<int64_t>(
dim.cast<DeviceMappingAttrInterface>().getMappingId())];		dim.cast<DeviceMappingAttrInterface>().getMappingId())];
bvm.map(iv, peIdOp);		bvm.map(iv, peIdOp);
}		}

// Step 4. Maybe create conditionals to predicate the region.		// Step 4. Maybe create conditionals to predicate the region.
// Skip this step when availableMappingSizes is empty.		// Skip this step when availableMappingSizes is empty.
		Location loc = forallOp.getLoc();
Value predicate;		Value predicate;
if (!availableMappingSizes.empty()) {		if (!availableMappingSizes.empty()) {
LLVM_DEBUG(llvm::interleaveComma(availableMappingSizes,		LLVM_DEBUG(llvm::interleaveComma(availableMappingSizes,
DBGS() << "availableMappingSizes: ");		DBGS() << "availableMappingSizes: ");
llvm::dbgs() << "\n");		llvm::dbgs() << "\n");
for (auto [id, mappingSize, availableMappingSize] :		for (auto [id, mappingSize, availableMappingSize] :
llvm::zip_equal(mappingIdOps, mappingSizes, availableMappingSizes)) {		llvm::zip_equal(mappingIdOps, mappingSizes, availableMappingSizes)) {
if (mappingSize > availableMappingSize) {		if (mappingSize > availableMappingSize) {
Show All 19 Lines	static FailureOr<ForallRewriteResult> rewriteOneForallCommonImpl(

// Step 5. Move the body of forallOp.		// Step 5. Move the body of forallOp.
// Erase the terminator first, it will not be used.		// Erase the terminator first, it will not be used.
rewriter.eraseOp(forallOp.getTerminator());		rewriter.eraseOp(forallOp.getTerminator());
Block *targetBlock;		Block *targetBlock;
Block::iterator insertionPoint;		Block::iterator insertionPoint;
if (predicate) {		if (predicate) {
// Step 5.a. If predicated, move at the beginning.		// Step 5.a. If predicated, move at the beginning.
auto ifOp =		auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);		/*withElseRegion=*/false);
targetBlock = ifOp.thenBlock();		targetBlock = ifOp.thenBlock();
insertionPoint = ifOp.thenBlock()->begin();		insertionPoint = ifOp.thenBlock()->begin();
} else {		} else {
// Step 5.b. Otherwise, move inline just at the rewriter insertion point.		// Step 5.b. Otherwise, move inline just at the rewriter insertion
		// point.
targetBlock = forallOp->getBlock();		targetBlock = forallOp->getBlock();
insertionPoint = rewriter.getInsertionPoint();		insertionPoint = rewriter.getInsertionPoint();
}		}
Block &sourceBlock = forallOp.getRegion().front();		Block &sourceBlock = forallOp.getRegion().front();
targetBlock->getOperations().splice(insertionPoint,		targetBlock->getOperations().splice(insertionPoint,
sourceBlock.getOperations());		sourceBlock.getOperations());

// Step 6. RAUW thread indices to thread ops.		// Step 6. RAUW thread indices to thread ops.
for (Value loopIndex : forallOp.getInductionVars()) {		for (Value loopIndex : forallOp.getInductionVars()) {
Value threadIdx = bvm.lookup(loopIndex);		Value threadIdx = bvm.lookup(loopIndex);
rewriter.replaceAllUsesWith(loopIndex, threadIdx);		rewriter.replaceAllUsesWith(loopIndex, threadIdx);
}		}

// Step 7. Erase old op.		// Step 7. Erase old op.
rewriter.eraseOp(forallOp);		rewriter.eraseOp(forallOp);

return mappingSizes;		return ForallRewriteResult{mappingSizes, mappingIdOps};
}		}

DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(		DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
RewriterBase &rewriter, TransformOpInterface transformOp,		RewriterBase &rewriter, TransformOpInterface transformOp,
scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,		scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,		const GpuIdBuilder &gpuIdBuilder) {
IdGeneratorFnType idGenerator) {
// Pass an empty anyAvailableMappingSizes.		// Pass an empty anyAvailableMappingSizes.
		Location loc = forallOp.getLoc();
SmallVector<int64_t> anyAvailableMappingSizes;		SmallVector<int64_t> anyAvailableMappingSizes;
FailureOr<SmallVector<int64_t>> maybeMappingSizes =		FailureOr<ForallRewriteResult> rewriteResult = rewriteOneForallCommonImpl(
rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,		rewriter, transformOp, forallOp, anyAvailableMappingSizes, gpuIdBuilder);
anyAvailableMappingSizes, allMappingAttributes,
idGenerator);		// Fail if anything goes wrong.
if (failed(maybeMappingSizes))		if (failed(rewriteResult))
		ThomasRaoux (inline comment): nit: doesn't feel like a very useful comment
return DiagnosedSilenceableFailure::definiteFailure();		return DiagnosedSilenceableFailure::definiteFailure();
gridDims = *maybeMappingSizes;		gridDims = rewriteResult->mappingSizes;

		// Replace ids of dimensions known to be 1 by 0 to simplify the IR.
		// Here, the result of mapping determines the available mapping sizes.
		replaceUnitMappingIdsHelper(rewriter, loc, rewriteResult->mappingIds,
		gridDims);

return DiagnosedSilenceableFailure::success();		return DiagnosedSilenceableFailure::success();
}		}

DiagnosedSilenceableFailure		DiagnosedSilenceableFailure
mlir::transform::gpu::findTopLevelForallOp(Operation *target,		mlir::transform::gpu::findTopLevelForallOp(Operation *target,
scf::ForallOp &topLevelForallOp,		scf::ForallOp &topLevelForallOp,
TransformOpInterface transformOp) {		TransformOpInterface transformOp) {
auto walkResult = target->walk([&](scf::ForallOp forallOp) {		auto walkResult = target->walk([&](scf::ForallOp forallOp) {
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	if (getGenerateGpuLaunch()) {
rewriter.eraseOp(topLevelForallOp);		rewriter.eraseOp(topLevelForallOp);
topLevelForallOp = cast<scf::ForallOp>(newForallOp);		topLevelForallOp = cast<scf::ForallOp>(newForallOp);
}		}

diag = verifyGpuMapping(transformOp, topLevelForallOp);		diag = verifyGpuMapping(transformOp, topLevelForallOp);
if (!diag.succeeded())		if (!diag.succeeded())
return diag;		return diag;

MappingToGpuBlocksHelper helper(getContext());		GpuBlockIdBuilder gpuBlockIdBuilder(getContext());
diag = mlir::transform::gpu::mapForallToBlocksImpl(		diag = mlir::transform::gpu::mapForallToBlocksImpl(
rewriter, transformOp, topLevelForallOp, gridDims,		rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
helper.mappingAttributes, helper.idGenerator);
if (!diag.succeeded())		if (!diag.succeeded())
return diag;		return diag;

diag = alterGpuLaunch(rewriter, gpuLaunch,		diag = alterGpuLaunch(rewriter, gpuLaunch,
cast<TransformOpInterface>(getOperation()), gridDims[0],		cast<TransformOpInterface>(getOperation()), gridDims[0],
gridDims[1], gridDims[2]);		gridDims[1], gridDims[2]);

results.push_back(gpuLaunch);		results.push_back(gpuLaunch);
return diag;		return diag;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// MapNestedForallToThreads		// MapNestedForallToThreads
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(		DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,		RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,		Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
bool syncAfterDistribute,		bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder) {
const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
IdGeneratorFnType idGenerator) {
DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();		DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
target->walk([&](scf::ForallOp forallOp) {		target->walk([&](scf::ForallOp forallOp) {
// Ignore cases with different attributes.		// Ignore cases with different attributes than this builder supports.
for (Attribute map : forallOp.getMapping()->getValue()) {		for (Attribute map : forallOp.getMapping()->getValue()) {
if (!llvm::is_contained(allMappingAttributes, map)) {		if (!llvm::is_contained(gpuIdBuilder.mappingAttributes, map)) {
return WalkResult::skip();		return WalkResult::skip();
}		}
}		}
diag = verifyGpuMapping(transformOp, forallOp);		diag = verifyGpuMapping(transformOp, forallOp);
if (diag.succeeded()) {		if (diag.succeeded()) {
// Take the loc ahead of time
Location loc = forallOp.getLoc();		Location loc = forallOp.getLoc();
OpBuilder::InsertionGuard g(rewriter);		OpBuilder::InsertionGuard g(rewriter);
		// Insert after to allow for syncthreads after `forall` is erased.
rewriter.setInsertionPointAfter(forallOp);		rewriter.setInsertionPointAfter(forallOp);
if (failed(rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,		FailureOr<ForallRewriteResult> rewriteResult = rewriteOneForallCommonImpl(
kernelBlockDims,		rewriter, transformOp, forallOp, kernelBlockDims, gpuIdBuilder);
allMappingAttributes, idGenerator)))
		// Fail if anything goes wrong.
		if (failed(rewriteResult))
diag = DiagnosedSilenceableFailure::definiteFailure();		diag = DiagnosedSilenceableFailure::definiteFailure();

// Add a syncthreads if needed. TODO: warpsync		// Add a syncthreads if needed. TODO: warpsync
if (syncAfterDistribute)		if (syncAfterDistribute)
rewriter.create<BarrierOp>(loc);		rewriter.create<BarrierOp>(loc);

		// Replace ids of dimensions known to be 1 by 0 to simplify the IR.
		// Here, the available mapping sizes are given by `kernelBlockDims`.
		replaceUnitMappingIdsHelper(rewriter, loc, rewriteResult->mappingIds,
		kernelBlockDims);
}		}
return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();		return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();
});		});
return diag;		return diag;
}		}

DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(		DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
Operation *target, ApplyToEachResultList &results, TransformState &state) {		Operation *target, ApplyToEachResultList &results, TransformState &state) {
Show All 13 Lines	DiagnosedSilenceableFailure diag =
blockDims[0], blockDims[1], blockDims[2]);		blockDims[0], blockDims[1], blockDims[2]);
if (diag.isSilenceableFailure()) {		if (diag.isSilenceableFailure()) {
diag.attachNote(getLoc()) << getBlockDimAttrName() << " is too large";		diag.attachNote(getLoc()) << getBlockDimAttrName() << " is too large";
return diag;		return diag;
}		}

MLIRContext *ctx = getContext();		MLIRContext *ctx = getContext();
IRRewriter rewriter(ctx);		IRRewriter rewriter(ctx);
MappingToGpuThreadsHelper helper(ctx);		SmallVector<Value> idCaptures;
		GpuThreadIdBuilder gpuThreadIdBuilder(ctx, &idCaptures);
diag = mlir::transform::gpu::mapNestedForallToThreadsImpl(		diag = mlir::transform::gpu::mapNestedForallToThreadsImpl(
rewriter, transformOp, target, blockDims, getSyncAfterDistribute(),		rewriter, transformOp, target, blockDims, getSyncAfterDistribute(),
helper.mappingAttributes, helper.idGenerator);		gpuThreadIdBuilder);

if (!diag.succeeded())		if (!diag.succeeded())
return diag;		return diag;

diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,		diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
std::nullopt, std::nullopt, blockDims[0], blockDims[1],		std::nullopt, std::nullopt, blockDims[0], blockDims[1],
blockDims[2]);		blockDims[2]);

Show All 34 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Transform] Add support for mapping to GPU warps and to linear ids
Closed · Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 505447

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Transform] Add support for mapping to GPU warps and to linear ids
Closed · Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 505447

mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

[mlir][Transform] Add support for mapping to GPU warps and to linear ids
Closed · Public