Diff 252355

mlir/include/mlir/Dialect/GPU/CMakeLists.txt

	add_mlir_dialect(GPUOps gpu GPUOps)			add_mlir_dialect(GPUOps gpu GPUOps)

				set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
				mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls)
				mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs)
				add_public_tablegen_target(MLIRParallelLoopMapperAttrGen)

				set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
				mlir_tablegen(ParallelLoopMapperEnums.h.inc -gen-enum-decls)
				mlir_tablegen(ParallelLoopMapperEnums.cpp.inc -gen-enum-defs)
				add_public_tablegen_target(MLIRParallelLoopMapperEnumsGen)

mlir/include/mlir/Dialect/GPU/GPUBase.td

This file was added.

				//===-- GPUBase.td - GPU dialect definitions ---------------- tablegen --===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// Defines the GPU dialect
				//
				//===----------------------------------------------------------------------===//

				#ifndef GPU_BASE
				#define GPU_BASE

				include "mlir/IR/OpBase.td"

				//===----------------------------------------------------------------------===//
				// GPU Dialect.
				//===----------------------------------------------------------------------===//

				// Definition of the `gpu` dialect. The C++ placed in extraClassDeclaration
				// provides static helpers: the names of the container-module, kernel and
				// kernel-module attributes, a kernel query, the number of workgroup
				// dimensions and the numeric workgroup/private address spaces.
				def GPU_Dialect : Dialect {
				let name = "gpu";
				// NOTE(review): presumably enables the dialect hook that verifies
				// dialect attributes attached to operations; confirm against OpBase.td.
				let hasOperationAttrVerify = 1;

				let extraClassDeclaration = [{
				/// Get the name of the attribute used to annotate the modules that contain
				/// kernel modules.
				static StringRef getContainerModuleAttrName() {
				return "gpu.container_module";
				}
				/// Get the name of the attribute used to annotate external kernel
				/// functions.
				static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }

				/// Get the name of the attribute used to annotate kernel modules.
				static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }

				/// Returns whether the given function is a kernel function, i.e., has the
				/// 'gpu.kernel' attribute.
				static bool isKernel(Operation *op);

				/// Returns the number of workgroup (thread, block) dimensions supported in
				/// the GPU dialect.
				// TODO(zinenko,herhut): consider generalizing this.
				static unsigned getNumWorkgroupDimensions() { return 3; }

				/// Returns the numeric value used to identify the workgroup memory address
				/// space.
				static unsigned getWorkgroupAddressSpace() { return 3; }

				/// Returns the numeric value used to identify the private memory address
				/// space.
				static unsigned getPrivateAddressSpace() { return 5; }
				}];
				}

				#endif // GPU_BASE

mlir/include/mlir/Dialect/GPU/GPUOps.td

	//===-- GPUOps.td - GPU dialect operation definitions ------- tablegen --===//			//===-- GPUOps.td - GPU dialect operation definitions ------- tablegen --===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// Defines some operations of the GPU dialect.			// Defines some operations of the GPU dialect.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef GPU_OPS			#ifndef GPU_OPS
	#define GPU_OPS			#define GPU_OPS

				include "mlir/Dialect/GPU/GPUBase.td"
	include "mlir/Dialect/LLVMIR/LLVMOpBase.td"			include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
	include "mlir/Interfaces/SideEffects.td"			include "mlir/Interfaces/SideEffects.td"

	// Type constraint accepting standard integers, indices and wrapped LLVM integer			// Type constraint accepting standard integers, indices and wrapped LLVM integer
	// types.			// types.
	def IntLikeOrLLVMInt : TypeConstraint<			def IntLikeOrLLVMInt : TypeConstraint<
	Or<[AnySignlessInteger.predicate, Index.predicate, LLVMInt.predicate]>,			Or<[AnySignlessInteger.predicate, Index.predicate, LLVMInt.predicate]>,
	"integer, index or LLVM dialect equivalent">;			"integer, index or LLVM dialect equivalent">;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// GPU Dialect operations.			// GPU Dialect operations.
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	// Definition of the `gpu` dialect. The C++ placed in extraClassDeclaration
	// provides static helpers: the names of the container-module, kernel and
	// kernel-module attributes, a kernel query, the number of workgroup
	// dimensions and the numeric workgroup/private address spaces.
	def GPU_Dialect : Dialect {
	let name = "gpu";
	// NOTE(review): presumably enables the dialect hook that verifies
	// dialect attributes attached to operations; confirm against OpBase.td.
	let hasOperationAttrVerify = 1;

	let extraClassDeclaration = [{
	/// Get the name of the attribute used to annotate the modules that contain
	/// kernel modules.
	static StringRef getContainerModuleAttrName() {
	return "gpu.container_module";
	}
	/// Get the name of the attribute used to annotate external kernel
	/// functions.
	static StringRef getKernelFuncAttrName() { return "gpu.kernel"; }

	/// Get the name of the attribute used to annotate kernel modules.
	static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; }

	/// Returns whether the given function is a kernel function, i.e., has the
	/// 'gpu.kernel' attribute.
	static bool isKernel(Operation *op);

	/// Returns the number of workgroup (thread, block) dimensions supported in
	/// the GPU dialect.
	// TODO(zinenko,herhut): consider generalizing this.
	static unsigned getNumWorkgroupDimensions() { return 3; }

	/// Returns the numeric value used to identify the workgroup memory address
	/// space.
	static unsigned getWorkgroupAddressSpace() { return 3; }

	/// Returns the numeric value used to identify the private memory address
	/// space.
	static unsigned getPrivateAddressSpace() { return 5; }
	}];
	}

	class GPU_Op<string mnemonic, list<OpTrait> traits = []> :			class GPU_Op<string mnemonic, list<OpTrait> traits = []> :
	Op<GPU_Dialect, mnemonic, traits>;			Op<GPU_Dialect, mnemonic, traits>;

	class GPU_IndexOp<string mnemonic, list<OpTrait> traits = []> :			class GPU_IndexOp<string mnemonic, list<OpTrait> traits = []> :
	GPU_Op<mnemonic, !listconcat(traits, [NoSideEffect])>,			GPU_Op<mnemonic, !listconcat(traits, [NoSideEffect])>,
	Arguments<(ins StrAttr:$dimension)>, Results<(outs Index)> {			Arguments<(ins StrAttr:$dimension)>, Results<(outs Index)> {
	let verifier = [{ return ::verifyIndexOp(*this); }];			let verifier = [{ return ::verifyIndexOp(*this); }];
	}			}
	▲ Show 20 Lines • Show All 608 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h

	//===- ParallelLoopMapper.h - Utilities for mapping parallel loops to GPU ====//			//===- ParallelLoopMapper.h - Utilities for mapping parallel loops to GPU ====//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	// This header file declares the utilities to generate mappings for parallel			// This header file declares the utilities to generate mappings for parallel
	// loops to GPU devices.			// loops to GPU devices.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H			#ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
	#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H			#define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H

				#include "mlir/IR/Attributes.h"
				#include "mlir/Support/LLVM.h"
				#include "llvm/ADT/DenseMap.h"

				#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.h.inc"

	namespace mlir {			namespace mlir {

				class AffineMap;
				struct LogicalResult;
				class Operation;
	class Region;			class Region;

				#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.h.inc"

				namespace loop {
				class ParallelOp;
				}

	namespace gpu {			namespace gpu {

	/// Name of the mapping attribute produced by loop mappers.			/// Name of the mapping attribute produced by loop mappers.
	static constexpr const char *kMappingAttributeName = "mapping";			StringRef getMappingAttrName();
	/// Name of the processor sub-attribute that identifies the hardware id
	/// to map a loop to.
	static constexpr const char *kProcessorEntryName = "processor";
	/// Name of the map sub-attribute that identifies the affine map to apply
	/// to the hardware id to compute the iteration number of the loop. This
	/// map is expected to be extended by step and lower bound computations:
	/// index = map(hardware_id) * step + lowerbound
	static constexpr const char *kIndexMapEntryName = "map";
	/// Name of the bound sub-attribute that identifies the affine map to
	/// compute an upper bound of iterations for the hardware id. This is
	/// applied to an upper bound on the number of iterations:
	/// launchBound = bound(upperbound-lowerbound ceildiv step)
	static constexpr const char *kBoundMapEntryName = "bound";

				/// Reads the `processor` field of a ParallelLoopDimMapping attribute and
				/// returns it as a Processor enumerant.
				inline Processor getProcessor(ParallelLoopDimMapping attr) {
				  // The field is stored as an integer attribute; convert its value back
				  // to the enum type.
				  const auto rawProcessor = attr.processor().getInt();
				  return static_cast<Processor>(rawProcessor);
				}

				/// Helper function to create a ParallelLoopDimMapping attribute.
				/// TODO(ravishankarm/antiagainst): Replace its uses with an auto-gened method.
				ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
				AffineMap map,
				AffineMap bound);

				/// Sets the mapping attribute of a loop.parallel operation. Verifies that the
				/// mapping passed is valid.
				/// - the number of ParallelLoopDimMapping attributes provided is the same as the number of loops of
				/// the `ploopOp`.
				ftynseUnsubmitted Done Reply Inline Actions Nit: ploopOp does not refer to anything in the code ftynse: Nit: ploopOp does not refer to anything in the code
				/// - the mapping does not map multiple loops to the same processor.
				herhutUnsubmitted Done Reply Inline Actions This is not illegal. herhut: This is not illegal.
				mravishankarAuthorUnsubmitted Done Reply Inline Actions I can remove that check, but I was going by what is here : https://github.com/llvm/llvm-project/blob/5c261c9c452959985de19540c168b224af24e2d3/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp#L673 mravishankar: I can remove that check, but I was going by what is here : https://github.com/llvm/llvm…
				ftynseUnsubmitted Done Reply Inline Actions Add a TODO for Stephan to support this case in the pipeline and update the check? :) ftynse: Add a TODO for Stephan to support this case in the pipeline and update the check? :)
				LogicalResult setMappingAttr(loop::ParallelOp ploopOp,
				ArrayRef<ParallelLoopDimMapping> mapping);
	} // end namespace gpu			} // end namespace gpu

	/// Maps the parallel loops found in the given function to workgroups. The first			/// Maps the parallel loops found in the given function to workgroups. The first
	/// loop encountered will be mapped to the global workgroup and the second loop			/// loop encountered will be mapped to the global workgroup and the second loop
	/// encountered to the local workgroup. Within each mapping, the first three			/// encountered to the local workgroup. Within each mapping, the first three
	/// dimensions are mapped to x/y/z hardware ids and all following dimensions are			/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
	/// mapped to sequential loops.			/// mapped to sequential loops.
	void greedilyMapParallelLoopsToGPU(Region &region);			void greedilyMapParallelLoopsToGPU(Region &region);

	} // end namespace mlir			} // end namespace mlir

	#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H			#endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H

mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td

This file was added.

				//===-- ParallelLoopMapperAttr.td - Attribute definition ---- tablegen --===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// Defines the attribute used for driving conversion from loop.parallel to
				// gpu.launch operations
				//
				//===----------------------------------------------------------------------===//

				#ifndef PARALLEL_LOOP_MAPPER_ATTR
				#define PARALLEL_LOOP_MAPPER_ATTR

				include "mlir/Dialect/AffineOps/AffineOpsBase.td"
				include "mlir/Dialect/GPU/GPUBase.td"

				def BlockX : I64EnumAttrCase<"BlockX", 0>;
				def BlockY : I64EnumAttrCase<"BlockY", 1>;
				antiagainstUnsubmitted Done Reply Inline Actions I'm wondering the convention here. Why all cap letters? `BlockX`, `ThreadY`/`Sequential`/etc. seems better to me. antiagainst: I'm wondering the convention here. Why all cap letters? `BlockX`, `ThreadY`/`Sequential`/etc.
				ftynseUnsubmitted Done Reply Inline Actions +1 for CamelCase. I used the same in the LLVM dialect for enumerants. ftynse: +1 for CamelCase. I used the same in the LLVM dialect for enumerants.
				def BlockZ : I64EnumAttrCase<"BlockZ", 2>;
				def ThreadX : I64EnumAttrCase<"ThreadX", 3>;
				def ThreadY : I64EnumAttrCase<"ThreadY", 4>;
				def ThreadZ : I64EnumAttrCase<"ThreadZ", 5>;
				def Sequential : I64EnumAttrCase<"Sequential", 6>;

				// Enum attribute naming the processor a loop dimension is mapped to: one of
				// the block ids (x/y/z), one of the thread ids (x/y/z), or Sequential. The
				// generated C++ enum is called `Processor` and is placed in ::mlir::gpu.
				def ProcessorAttr : I64EnumAttr<"Processor", "processor for loop mapping", [
				BlockX, BlockY, BlockZ, ThreadX, ThreadY, ThreadZ, Sequential]> {
				let cppNamespace = "::mlir::gpu";
				}

				// Attribute that drives conversion of a loop.parallel to gpu.launch
				// operation.
				// processor: the hardware id to map to.
				// map : An affine map that is used to pre-process hardware ids before
				// substitution.
				// bound : An affine map that is used to compute the bound of the hardware
				// id based on an upper bound of the number of iterations.
				def ParallelLoopDimMappingAttr :
				StructAttr<"ParallelLoopDimMapping", GPU_Dialect,
				herhutUnsubmitted Not Done Reply Inline Actions `ParallelLoopDimMappingAttr` as it describes the mapping and not the mapper. herhut: `ParallelLoopDimMappingAttr` as it describes the mapping and not the mapper.
				[StructFieldAttr<"processor", ProcessorAttr>,
				StructFieldAttr<"map", AffineMapAttr>,
				StructFieldAttr<"bound", AffineMapAttr>]>;


				// Array attribute constrained to ParallelLoopDimMappingAttr elements.
				// NOTE(review): presumably holds one entry per loop dimension; confirm
				// against the setMappingAttr implementation.
				def ParallelLoopMappingAttr :
				TypedArrayAttrBase<ParallelLoopDimMappingAttr,
				"parallel loop to processor mapping attribute">;

				#endif // PARALLEL_LOOP_MAPPER_ATTR

mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

Show First 20 Lines • Show All 494 Lines • ▼ Show 20 Lines

namespace {		namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {		struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
using OpRewritePattern<ParallelOp>::OpRewritePattern;		using OpRewritePattern<ParallelOp>::OpRewritePattern;

LogicalResult matchAndRewrite(ParallelOp parallelOp,		LogicalResult matchAndRewrite(ParallelOp parallelOp,
PatternRewriter &rewriter) const override;		PatternRewriter &rewriter) const override;
};		};

/// Decoded form of the mapping annotation attached to one dimension of a
/// parallel loop.
struct MappingAnnotation {
/// Processor id the dimension is mapped to, as read from the `processor`
/// entry of the annotation.
unsigned processor;
/// Affine map read from the `map` entry; composed with the step and lower
/// bound computation to obtain the loop index from the hardware id.
AffineMap indexMap;
/// Affine map read from the `bound` entry, used to compute the launch bound.
/// Left default-constructed (null) when the annotation has no such entry.
AffineMap boundMap;
};

} // namespace		} // namespace

/// Decodes a mapping annotation from `attribute`, which must be a dictionary
/// attribute of the form
///   { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
/// The `bound` entry is optional; when it is missing the returned `boundMap`
/// is a default-constructed (null) AffineMap.
static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
  auto entries = attribute.cast<DictionaryAttr>();

  // Mandatory entries: `cast` asserts if they are absent or of another kind.
  auto processorAttr =
      entries.get(gpu::kProcessorEntryName).cast<IntegerAttr>();
  unsigned processorId = processorAttr.getValue().getSExtValue();
  auto indexMapAttr =
      entries.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>();
  AffineMap indexMap = indexMapAttr.getValue();

  // Optional entry: a missing `bound` yields a null attribute here.
  auto boundMapAttr =
      entries.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
  AffineMap boundMap = boundMapAttr ? boundMapAttr.getValue() : AffineMap();

  return {processorId, indexMap, boundMap};
}

/// Tries to derive a static upper bound from the defining operation of		/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.		/// `upperBound`.
static Value deriveStaticUpperBound(Value upperBound,		static Value deriveStaticUpperBound(Value upperBound,
PatternRewriter &rewriter) {		PatternRewriter &rewriter) {
if (AffineMinOp minOp =		if (AffineMinOp minOp =
dyn_cast_or_null<AffineMinOp>(upperBound.getDefiningOp())) {		dyn_cast_or_null<AffineMinOp>(upperBound.getDefiningOp())) {
for (const AffineExpr &result : minOp.map().getResults()) {		for (const AffineExpr &result : minOp.map().getResults()) {
if (AffineConstantExpr constExpr =		if (AffineConstantExpr constExpr =
result.dyn_cast<AffineConstantExpr>()) {		result.dyn_cast<AffineConstantExpr>()) {
return rewriter.create<ConstantIndexOp>(minOp.getLoc(),		return rewriter.create<ConstantIndexOp>(minOp.getLoc(),
constExpr.getValue());		constExpr.getValue());
}		}
}		}
}		}
return {};		return {};
}		}

		/// Returns true for every processor other than Processor::Sequential, i.e.
		/// when the loop dimension is replaced by a thread/grid index rather than
		/// lowered to a sequential loop.
		static bool isMappedToProcessor(gpu::Processor processor) {
		  const bool isSequential = processor == gpu::Processor::Sequential;
		  return !isSequential;
		}

		/// Returns the position associated with `processor` in a gpu.launch op:
		/// 0-2 for BlockX/Y/Z and 3-5 for ThreadX/Y/Z. Callers use the result both
		/// as a body block-argument index and as a launch operand index. Any other
		/// processor value (including Sequential) reaches llvm_unreachable.
		static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
		antiagainstUnsubmitted Not Done Reply Inline Actions Isn't this just `static_cast<unsigned>(processor)` excluding `gpu::Processor::SEQUENTIAL`? Do we need this explicit switch here? antiagainst: Isn't this just `static_cast<unsigned>(processor)` excluding `gpu::Processor::SEQUENTIAL`? Do…
		mravishankarAuthorUnsubmitted Done Reply Inline Actions I am very uncomfortable relying on the enum value implicitly (especially mapping between two separate "enums" based on enum value). If one of the enums change (due to adding new enums), then it can lead to weird errors that can be hard to track down. Plus this is more readable IMO. Worth the cost of a switch-case? mravishankar: I am very uncomfortable relying on the enum value implicitly (especially mapping between two…
		switch (processor) {
		case gpu::Processor::BlockX:
		return 0;
		case gpu::Processor::BlockY:
		return 1;
		case gpu::Processor::BlockZ:
		return 2;
		case gpu::Processor::ThreadX:
		return 3;
		case gpu::Processor::ThreadY:
		return 4;
		case gpu::Processor::ThreadZ:
		return 5;
		// Every remaining enumerant has no launch position; fall out of the
		// switch into the unreachable below.
		default:;
		}
		llvm_unreachable(
		"invalid processor type while retrieving launch op argument number");
		}

/// Modifies the current transformation state to capture the effect of the given		/// Modifies the current transformation state to capture the effect of the given
/// `loop.parallel` operation on index substitutions and the operations to be		/// `loop.parallel` operation on index substitutions and the operations to be
/// inserted.		/// inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,		/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will		/// this function will
/// - compute the loop index based on the hardware id and affine map from the		/// - compute the loop index based on the hardware id and affine map from the
/// mapping and update `cloningMap` to substitute all uses.		/// mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided		/// - derive a new upper bound for the hardware id and augment the provided
/// `gpu.launch operation` accordingly.		/// `gpu.launch operation` accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`		/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
/// and update the rewriter to insert into the conditional's body.		/// and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,		/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into		/// - insert a for loop into the body and update the rewriter to insert into
/// the for loop's body.		/// the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of		/// - update the `cloningMap` to replace uses of the index with the index of
/// the new for loop.		/// the new for loop.
/// In either case,		/// In either case,
/// - append the instructions from the loops body to worklist, in reverse order.		/// - append the instructions from the loops body to worklist, in reverse order.
/// To note the end of the current scope in case a loop or conditional was		/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the		/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter		/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.		/// one scope-level up.
static LogicalResult processParallelLoop(ParallelOp parallelOp,		static LogicalResult processParallelLoop(
gpu::LaunchOp launchOp,		ParallelOp parallelOp, gpu::LaunchOp launchOp,
BlockAndValueMapping &cloningMap,		BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
SmallVectorImpl<Operation *> &worklist,		DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
DenseMap<int, Value> &bounds,
PatternRewriter &rewriter) {
// TODO(herhut): Verify that this is a valid GPU mapping.		// TODO(herhut): Verify that this is a valid GPU mapping.
// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential		// processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
ArrayAttr mapping =		ArrayAttr mapping =
parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName);		parallelOp.getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

// TODO(herhut): Support reductions.		// TODO(herhut): Support reductions.
if (!mapping \|\| parallelOp.getNumResults() != 0)		if (!mapping \|\| parallelOp.getNumResults() != 0)
return failure();		return failure();

Location loc = parallelOp.getLoc();		Location loc = parallelOp.getLoc();

auto launchIndependent = [&launchOp](Value val) {		auto launchIndependent = [&launchOp](Value val) {
Show All 10 Lines	static LogicalResult processParallelLoop(
};		};

for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),		for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),
parallelOp.lowerBound(), parallelOp.upperBound(),		parallelOp.lowerBound(), parallelOp.upperBound(),
parallelOp.step())) {		parallelOp.step())) {
Attribute mappingAttribute;		Attribute mappingAttribute;
Value iv, lowerBound, upperBound, step;		Value iv, lowerBound, upperBound, step;
std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;		std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
MappingAnnotation annotation = extractMappingAnnotation(mappingAttribute);		auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
		if (!annotation)
		return parallelOp.emitOpError()
		<< "expected mapping attribute for lowering to GPU";
Value newIndex;		Value newIndex;
		gpu::Processor processor = gpu::getProcessor(annotation);

if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) {		if (isMappedToProcessor(processor)) {
// Use the corresponding thread/grid index as replacement for the loop iv.		// Use the corresponding thread/grid index as replacement for the loop iv.
Value operand = launchOp.body().front().getArgument(annotation.processor);		Value operand = launchOp.body().front().getArgument(
		getLaunchOpArgumentNum(processor));
// Take the indexmap and add the lower bound and step computations in.		// Take the indexmap and add the lower bound and step computations in.
// This computes operand * step + lowerBound.		// This computes operand * step + lowerBound.
// Use an affine map here so that it composes nicely with the provided		// Use an affine map here so that it composes nicely with the provided
// annotation.		// annotation.
AffineMap lowerAndStep = AffineMap::get(		AffineMap lowerAndStep = AffineMap::get(
1, 2,		1, 2,
rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +		rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
rewriter.getAffineSymbolExpr(1));		rewriter.getAffineSymbolExpr(1));
newIndex = rewriter.create<AffineApplyOp>(		newIndex = rewriter.create<AffineApplyOp>(
loc, annotation.indexMap.compose(lowerAndStep),		loc, annotation.map().getValue().compose(lowerAndStep),
ValueRange{operand, step, lowerBound});		ValueRange{operand, step, lowerBound});
// If there was also a bound, insert that, too.		// If there was also a bound, insert that, too.
// TODO(herhut): Check that we do not assign bounds twice.		// TODO(herhut): Check that we do not assign bounds twice.
if (annotation.boundMap) {		if (annotation.bound().getValue()) {
// We pass as the single operand to the bound-map the number of		// We pass as the single operand to the bound-map the number of
// iterations, which is (upperBound - lowerBound) ceilDiv step. To		// iterations, which is (upperBound - lowerBound) ceilDiv step. To
// support inner loops with dynamic upper bounds (as generated by e.g.		// support inner loops with dynamic upper bounds (as generated by e.g.
// tiling), try to derive a max for the bounds. If the used bound for		// tiling), try to derive a max for the bounds. If the used bound for
// the hardware id is imprecise, wrap the contained code into a		// the hardware id is imprecise, wrap the contained code into a
// conditional. If the lower-bound is constant or defined before the		// conditional. If the lower-bound is constant or defined before the
// launch, we can use it in the launch bounds. Otherwise fail.		// launch, we can use it in the launch bounds. Otherwise fail.
if (!launchIndependent(lowerBound) &&		if (!launchIndependent(lowerBound) &&
Show All 23 Lines	if (isMappedToProcessor(processor)) {
// affine expression ceilDiv (upperBound - lowerBound) step. We use		// affine expression ceilDiv (upperBound - lowerBound) step. We use
// affine.apply here so that it composes nicely with the provided map.		// affine.apply here so that it composes nicely with the provided map.
AffineMap stepMap =		AffineMap stepMap =
AffineMap::get(0, 3,		AffineMap::get(0, 3,
((rewriter.getAffineSymbolExpr(0) -		((rewriter.getAffineSymbolExpr(0) -
rewriter.getAffineSymbolExpr(1))		rewriter.getAffineSymbolExpr(1))
.ceilDiv(rewriter.getAffineSymbolExpr(2))));		.ceilDiv(rewriter.getAffineSymbolExpr(2))));
Value launchBound = rewriter.create<AffineApplyOp>(		Value launchBound = rewriter.create<AffineApplyOp>(
loc, annotation.boundMap.compose(stepMap),		loc, annotation.bound().getValue().compose(stepMap),
ValueRange{		ValueRange{
ensureLaunchIndependent(		ensureLaunchIndependent(
cloningMap.lookupOrDefault(upperBound)),		cloningMap.lookupOrDefault(upperBound)),
ensureLaunchIndependent(		ensureLaunchIndependent(
cloningMap.lookupOrDefault(lowerBound)),		cloningMap.lookupOrDefault(lowerBound)),
ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});		ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
if (bounds.find(annotation.processor) != bounds.end()) {		// TODO(herhut,ravishankarm): Update the behavior of setMappingAttr
		// when this condition is relaxed.
		if (bounds.find(processor) != bounds.end()) {
return parallelOp.emitOpError()		return parallelOp.emitOpError()
<< "cannot redefine the bound for processor "		<< "cannot redefine the bound for processor "
<< annotation.processor;		<< static_cast<int64_t>(processor);
}		}
bounds[annotation.processor] = launchBound;		bounds[processor] = launchBound;
}		}
if (!boundIsPrecise) {		if (!boundIsPrecise) {
// We are using an approximation, create a surrounding conditional.		// We are using an approximation, create a surrounding conditional.
Value originalBound = std::get<3>(config);		Value originalBound = std::get<3>(config);
CmpIOp pred = rewriter.create<CmpIOp>(		CmpIOp pred = rewriter.create<CmpIOp>(
loc, CmpIPredicate::slt, newIndex,		loc, CmpIPredicate::slt, newIndex,
cloningMap.lookupOrDefault(originalBound));		cloningMap.lookupOrDefault(originalBound));
loop::IfOp ifOp = rewriter.create<loop::IfOp>(loc, pred, false);		loop::IfOp ifOp = rewriter.create<loop::IfOp>(loc, pred, false);
▲ Show 20 Lines • Show All 65 Lines • ▼ Show 20 Lines	ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(		gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,		parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
constantOne, constantOne);		constantOne, constantOne);
rewriter.setInsertionPointToEnd(&launchOp.body().front());		rewriter.setInsertionPointToEnd(&launchOp.body().front());
rewriter.create<gpu::TerminatorOp>(loc);		rewriter.create<gpu::TerminatorOp>(loc);
rewriter.setInsertionPointToStart(&launchOp.body().front());		rewriter.setInsertionPointToStart(&launchOp.body().front());

BlockAndValueMapping cloningMap;		BlockAndValueMapping cloningMap;
llvm::DenseMap<int, Value> launchBounds;		llvm::DenseMap<gpu::Processor, Value> launchBounds;
SmallVector<Operation *, 16> worklist;		SmallVector<Operation *, 16> worklist;
if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,		if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
launchBounds, rewriter)))		launchBounds, rewriter)))
return failure();		return failure();

// Whether we have seen any side-effects. Reset when leaving an inner scope.		// Whether we have seen any side-effects. Reset when leaving an inner scope.
bool seenSideeffects = false;		bool seenSideeffects = false;
// Whether we have left a nesting scope (and hence are no longer innermost).		// Whether we have left a nesting scope (and hence are no longer innermost).
Show All 35 Lines	if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
if (seenSideeffects && leftNestingScope)		if (seenSideeffects && leftNestingScope)
return failure();		return failure();
}		}
}		}

// Now that we succeeded creating the launch operation, also update the		// Now that we succeeded creating the launch operation, also update the
// bounds.		// bounds.
for (auto bound : launchBounds)		for (auto bound : launchBounds)
launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));		launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
		std::get<1>(bound));

rewriter.eraseOp(parallelOp);		rewriter.eraseOp(parallelOp);
return success();		return success();
}		}

void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,		void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
MLIRContext *ctx) {		MLIRContext *ctx) {
patterns.insert<ParallelToGpuLaunchLowering>(ctx);		patterns.insert<ParallelToGpuLaunchLowering>(ctx);
}		}

mlir/lib/Dialect/GPU/CMakeLists.txt

	add_mlir_dialect_library(MLIRGPU			add_mlir_dialect_library(MLIRGPU
	IR/GPUDialect.cpp			IR/GPUDialect.cpp
	Transforms/AllReduceLowering.cpp			Transforms/AllReduceLowering.cpp
	Transforms/KernelOutlining.cpp			Transforms/KernelOutlining.cpp
	Transforms/MemoryPromotion.cpp			Transforms/MemoryPromotion.cpp
	Transforms/ParallelLoopMapper.cpp			Transforms/ParallelLoopMapper.cpp

	ADDITIONAL_HEADER_DIRS			ADDITIONAL_HEADER_DIRS
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU			${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU

	DEPENDS			DEPENDS
	MLIRGPUOpsIncGen			MLIRGPUOpsIncGen
				MLIRParallelLoopMapperAttrGen
				MLIRParallelLoopMapperEnumsGen
	)			)
	target_link_libraries(MLIRGPU			target_link_libraries(MLIRGPU
	PUBLIC			PUBLIC
	MLIREDSC			MLIREDSC
	MLIRIR			MLIRIR
	MLIRLLVMIR			MLIRLLVMIR
	MLIRLoopOps			MLIRLoopOps
	MLIRPass			MLIRPass
	MLIRSideEffects			MLIRSideEffects
	MLIRStandardOps			MLIRStandardOps
	MLIRSupport			MLIRSupport
	MLIRTransformUtils			MLIRTransformUtils
	LLVMSupport			LLVMSupport
	)			)

mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp

	Show All 17 Lines
	#include "mlir/Dialect/LoopOps/LoopOps.h"			#include "mlir/Dialect/LoopOps/LoopOps.h"
	#include "mlir/IR/AffineMap.h"			#include "mlir/IR/AffineMap.h"
	#include "mlir/Pass/Pass.h"			#include "mlir/Pass/Pass.h"

	using namespace mlir;			using namespace mlir;
	using namespace mlir::gpu;			using namespace mlir::gpu;
	using namespace mlir::loop;			using namespace mlir::loop;

				#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc"
				namespace mlir {

				#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc"
				namespace gpu {

				StringRef getMappingAttrName() { return "mapping"; }

				ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor,
				AffineMap map,
				AffineMap bound) {
				MLIRContext *context = map.getContext();
				OpBuilder builder(context);
				return ParallelLoopDimMapping::get(
				builder.getI64IntegerAttr(static_cast<int32_t>(processor)),
				AffineMapAttr::get(map), AffineMapAttr::get(bound), context);
				}

				LogicalResult setMappingAttr(loop::ParallelOp ploopOp,
				ArrayRef<ParallelLoopDimMapping> mapping) {
				// Verify that each processor is mapped to only once.
				llvm::DenseSet<gpu::Processor> specifiedMappings;
				for (auto dimAttr : mapping) {
				gpu::Processor processor = getProcessor(dimAttr);
				if (processor != gpu::Processor::Sequential &&
				ftynseUnsubmitted Done Reply Inline Actions You have `getProcessor` for this ftynse: You have `getProcessor` for this
				mravishankarAuthorUnsubmitted Done Reply Inline Actions Oh yes! Thanks! mravishankar: Oh yes! Thanks!
				specifiedMappings.count(processor))
				herhutUnsubmitted Done Reply Inline Actions Why is this invalid? You could map to the same processor but use `ceilDiv` and `modulo` in the `map` attribute to decompose the bound again. herhut: Why is this invalid? You could map to the same processor but use `ceilDiv` and `modulo` in the…
				mravishankarAuthorUnsubmitted Done Reply Inline Actions I was going by what is here : https://github.com/llvm/llvm-project/blob/5c261c9c452959985de19540c168b224af24e2d3/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp#L673. Sorry if I misread it mravishankar: I was going by what is here : https://github.com/llvm/llvm…
				return ploopOp.emitError(
				"invalid mapping multiple loops to same processor");
				}
				ArrayRef<Attribute> mappingAsAttrs(mapping.data(), mapping.size());
				ploopOp.setAttr(getMappingAttrName(),
				ArrayAttr::get(mappingAsAttrs, ploopOp.getContext()));
				return success();
				}
				} // namespace gpu
				} // namespace mlir

	namespace {			namespace {

	enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };			enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };

	static constexpr int kNumHardwareIds = 3;			static constexpr int kNumHardwareIds = 3;

	} // namespace			} // namespace

	/// Bounded increment on MappingLevel. Increments to the next			/// Bounded increment on MappingLevel. Increments to the next
	/// level unless Sequential was already reached.			/// level unless Sequential was already reached.
	MappingLevel &operator++(MappingLevel &mappingLevel) {			MappingLevel &operator++(MappingLevel &mappingLevel) {
	if (mappingLevel < Sequential) {			if (mappingLevel < Sequential) {
	mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);			mappingLevel = static_cast<MappingLevel>(mappingLevel + 1);
	}			}
	return mappingLevel;			return mappingLevel;
	}			}

	/// Computed the hardware id to use for a given mapping level. Will			/// Computed the hardware id to use for a given mapping level. Will
	/// assign x,y and z hardware ids for the first 3 dimensions and use			/// assign x,y and z hardware ids for the first 3 dimensions and use
	/// sequential after.			/// sequential after.
	static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {			/// TODO(ravishankarm/herhut) : Make this use x for the inner-most loop that is
				/// distributed to map to x, the next innermost to y and the next innermost to
				/// z.
				static gpu::Processor getHardwareIdForMapping(MappingLevel level,
				int dimension) {

				herhutUnsubmitted Done Reply Inline Actions I do not understand this comment. herhut: I do not understand this comment.
				mravishankarAuthorUnsubmitted Done Reply Inline Actions Maybe this comment is misplaced (I should move it below). At line 136 below, the 0th-induction variable is mapped to processor x, and the 1st induction variable to processor y, etc. Typically the 0th induction variable is the "outer" parallel loop. The 1st induction variable is the next inner, etc. There is no strong reason for it, but typically the inner-most parallel loop is also used to access the data in stride 1 (in elementwise operations for example). So a default mapping can just try to handle this common case. There are cases where this doesn't work, obviously, and maybe a more general mechanism to control which dimension maps to which processor dimension is useful. Is that what the mapping and bound are expected to do? mravishankar: Maybe this comment is misplaced (I should move it below). At line 136 below, the 0th-induction…
	if (dimension >= kNumHardwareIds \|\| level == Sequential)			if (dimension >= kNumHardwareIds \|\| level == Sequential)
	return Sequential * kNumHardwareIds;			return Processor::Sequential;
	return (level * kNumHardwareIds) + dimension;			switch (level) {
				case MapGrid:
				switch (dimension) {
				case 0:
				return Processor::BlockX;
				case 1:
				return Processor::BlockY;
				case 2:
				return Processor::BlockZ;
				default:
				return Processor::Sequential;
				}
				break;
				case MapBlock:
				switch (dimension) {
				case 0:
				return Processor::ThreadX;
				case 1:
				return Processor::ThreadY;
				case 2:
				return Processor::ThreadZ;
				default:
				return Processor::Sequential;
				}
				default:;
				}
				return Processor::Sequential;
	}			}

	/// Add mapping information to the given parallel loop. Do not add			/// Add mapping information to the given parallel loop. Do not add
	/// mapping information if the loop already has it. Also, don't			/// mapping information if the loop already has it. Also, don't
	/// start a mapping at a nested loop.			/// start a mapping at a nested loop.
	static void mapParallelOp(ParallelOp parallelOp,			static void mapParallelOp(ParallelOp parallelOp,
	MappingLevel mappingLevel = MapGrid) {			MappingLevel mappingLevel = MapGrid) {
	// Do not try to add a mapping to already mapped loops or nested loops.			// Do not try to add a mapping to already mapped loops or nested loops.
	if (parallelOp.getAttr(gpu::kMappingAttributeName) \|\|			if (parallelOp.getAttr(getMappingAttrName()) \|\|
	((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))			((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
	return;			return;

	MLIRContext *ctx = parallelOp.getContext();			MLIRContext *ctx = parallelOp.getContext();
	Builder b(ctx);			Builder b(ctx);
	SmallVector<Attribute, 4> attrs;			SmallVector<ParallelLoopDimMapping, 4> attrs;
	attrs.reserve(parallelOp.getNumInductionVars());			attrs.reserve(parallelOp.getNumInductionVars());
	for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {			for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
	SmallVector<NamedAttribute, 3> entries;			attrs.push_back(getParallelLoopDimMappingAttr(
	entries.emplace_back(b.getNamedAttr(			getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
	kProcessorEntryName,			b.getDimIdentityMap()));
	b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i))));
	entries.emplace_back(b.getNamedAttr(
	kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
	entries.emplace_back(b.getNamedAttr(
	kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
	attrs.push_back(DictionaryAttr::get(entries, ctx));
	}			}
	parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx));			setMappingAttr(parallelOp, attrs);
	++mappingLevel;			++mappingLevel;
	// Parallel loop operations are immediately nested, so do not use			// Parallel loop operations are immediately nested, so do not use
	// walk but just iterate over the operations.			// walk but just iterate over the operations.
	for (Operation &op : *parallelOp.getBody()) {			for (Operation &op : *parallelOp.getBody()) {
	if (ParallelOp nested = dyn_cast<ParallelOp>(op))			if (ParallelOp nested = dyn_cast<ParallelOp>(op))
	mapParallelOp(nested, mappingLevel);			mapParallelOp(nested, mappingLevel);
	}			}
	}			}

	void mlir::greedilyMapParallelLoopsToGPU(Region &region) {			void mlir::greedilyMapParallelLoopsToGPU(Region &region) {
	region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });			region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
	}			}

mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir

	// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s \| FileCheck %s -dump-input-on-failure			// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s \| FileCheck %s -dump-input-on-failure

	// 2-d parallel loop mapped to block.y and block.x			// 2-d parallel loop mapped to block.y and block.x

	func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,			func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,
	%arg3 : index, %arg4 : index,			%arg3 : index, %arg4 : index,
	%buf : memref<?x?xf32>,			%buf : memref<?x?xf32>,
	%res : memref<?x?xf32>) {			%res : memref<?x?xf32>) {
	%step = constant 2 : index			%step = constant 2 : index
	loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)			loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
	step (%arg4, %step) {			step (%arg4, %step) {
	%val = load %buf[%i0, %i1] : memref<?x?xf32>			%val = load %buf[%i0, %i1] : memref<?x?xf32>
	store %val, %res[%i1, %i0] : memref<?x?xf32>			store %val, %res[%i1, %i0] : memref<?x?xf32>
	} { mapping = [{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}] }			} { mapping = [{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}] }
	▲ Show 20 Lines • Show All 289 Lines • ▼ Show 20 Lines

	// Mapping to the same processor twice.			// Mapping to the same processor twice.

	func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,			func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
	%arg3 : index,			%arg3 : index,
	%buf : memref<?x?xf32>,			%buf : memref<?x?xf32>,
	%res : memref<?x?xf32>) {			%res : memref<?x?xf32>) {
	%four = constant 4 : index			%four = constant 4 : index
	// expected-error@+2 {{cannot redefine the bound for processor 1}}			// expected-error@+2 {{cannot redefine the bound for processor 1}}
				herhutUnsubmitted Done Reply Inline Actions Now the error message no longer corresponds to what is encoded in the attribute. If I search for `BLOCKY` in the input I would not find it. herhut: Now the error message no longer corresponds to what is encoded in the attribute. If I search…
				mravishankarAuthorUnsubmitted Done Reply Inline Actions I think this is an issue with the parsing of custom StructAttr while roundtripping. It's implemented as an IntegerAttr. So while printing and parsing it will use the enum value. If you adapt the loop.parallel parser/printer to custom parse the mapping attribute you can make it accept keywords like "BLOCKX", "BLOCKY", etc. and print it out that way as well. Then the error message here would match up well. For now I reverted the error message to print it as it was before. I can adapt the parser/printer to roundtrip the processor value as something more meaningful. (I can add that to this change itself, but it would be a breaking change and not an NFC). Your call. mravishankar: I think this is an issue with the parsing of custom StructAttr while roundtripping. It's…
	// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}			// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
	loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)			loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
	step (%four, %four) {			step (%four, %four) {
	} { mapping = [			} { mapping = [
	{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},			{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
	{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}			{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
	] }			] }
	return			return
	}			}

	// -----			// -----

	// Loop with loop-variant upper bound.			// Loop with loop-variant upper bound.

	func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,			func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
	%arg3 : index,			%arg3 : index,
	%buf : memref<?x?xf32>,			%buf : memref<?x?xf32>,
	%res : memref<?x?xf32>) {			%res : memref<?x?xf32>) {
	%zero = constant 0 : index			%zero = constant 0 : index
	%one = constant 1 : index			%one = constant 1 : index
	%four = constant 4 : index			%four = constant 4 : index
	// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}			// expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
	loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)			loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
	step (%four, %four) {			step (%four, %four) {
	// expected-error@+1 {{cannot derive loop-invariant upper bound}}			// expected-error@+1 {{cannot derive loop-invariant upper bound}}
	loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)			loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
	step (%one, %one) {			step (%one, %one) {
	%idx0 = addi %i0, %si0 : index			%idx0 = addi %i0, %si0 : index
	%idx1 = addi %i1, %si1 : index			%idx1 = addi %i1, %si1 : index
	%val = load %buf[%idx0, %idx1] : memref<?x?xf32>			%val = load %buf[%idx0, %idx1] : memref<?x?xf32>
	store %val, %res[%idx1, %idx0] : memref<?x?xf32>			store %val, %res[%idx1, %idx0] : memref<?x?xf32>
	} { mapping = [			} { mapping = [
	{processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},			{processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
	{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}			{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
	] }			] }
	} { mapping = [			} { mapping = [
	{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},			{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
	{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}			{processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
	] }			] }
	return			return
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][GPU] Use StructAttr to drive lowering from loop.parallel to gpu.launch
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 252355

mlir/include/mlir/Dialect/GPU/CMakeLists.txt

mlir/include/mlir/Dialect/GPU/GPUBase.td

mlir/include/mlir/Dialect/GPU/GPUOps.td

mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h

mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td

mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

mlir/lib/Dialect/GPU/CMakeLists.txt

mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp

mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][GPU] Use StructAttr to drive lowering from loop.parallel to gpu.launch
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 252355

mlir/include/mlir/Dialect/GPU/CMakeLists.txt

mlir/include/mlir/Dialect/GPU/GPUBase.td

mlir/include/mlir/Dialect/GPU/GPUOps.td

mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h

mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td

mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

mlir/lib/Dialect/GPU/CMakeLists.txt

mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp

mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir

[mlir][GPU] Use StructAttr to drive lowering from loop.parallel to gpu.launch
ClosedPublic