This is an archive of the discontinued LLVM Phabricator instance.

[mlir][SCF] Add parallel abstraction on tensors.
ClosedPublic

Authored by nicolasvasilache on May 27 2022, 9:22 AM.

Details

Summary

This revision adds scf.foreach_thread and other supporting abstractions
that allow connecting parallel abstractions and tensors.

Discussion is available here.

Diff Detail

Event Timeline

nicolasvasilache requested review of this revision. May 27 2022, 9:22 AM
Herald added a project: Restricted Project. May 27 2022, 9:22 AM

Fix invalid test.

Fix invalid test.

Drop dead code, clarify the notion of "virtual threads of execution".

ftynse requested changes to this revision. May 30 2022, 1:48 AM
ftynse added inline comments.
mlir/include/mlir/Dialect/SCF/SCFOps.td
329

Nit: "function" is confusing here since the body is not isolated from above as a function would be and only has one block while the function would have a CFG.

342

It isn't clear how exactly the result values are formed, given that parallel_insert_slice doesn't actually return anything and neither does the terminator. Maybe add a reference to the terminator documentation here.

451

Can't this be something like llvm::iterator_range<Block::iterator>?
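For illustration, a minimal sketch of the kind of accessor being suggested; the helper name and the block it operates on are placeholders, not the actual code in this patch:

```cpp
#include "mlir/IR/Block.h"
#include "llvm/ADT/iterator_range.h"

using namespace mlir;

// Hypothetical accessor: return a lazy range over the operations of the
// single block instead of materializing them into a vector.
llvm::iterator_range<Block::iterator> getBodyOps(Block &body) {
  return llvm::make_range(body.begin(), body.end());
}
```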

455–462

We can just mark PerformConcurrentlyOp as not having a terminator at all, similarly to how ModuleOp does not. This will remove the need for this operation, and likely simplify some other logic.
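For context, a minimal sketch of the trait being referred to; the helper below is illustrative, not part of the patch:

```cpp
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Operation.h"

using namespace mlir;

// An op that declares OpTrait::NoTerminator (as ModuleOp does) may hold a
// single block that ends without any terminator operation.
bool blockMayOmitTerminator(Operation *op) {
  return op->hasTrait<OpTrait::NoTerminator>();
}
```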

468

Nit: let's not commit commented-out code.

516–521

Let's add the namespace prefix systematically.

mlir/lib/Dialect/SCF/SCF.cpp
1059

Please add a test for all user-visible error messages.

1060

"index type index" sounds tautological.

1072

Nit: something like "type mismatch between result " << i << " (" << ... << ") of the terminator" would avoid the awkward "1th result" and "2th result".
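A sketch of the diagnostic style being suggested; the helper name and exact message wording are illustrative, not what was adopted in the patch:

```cpp
#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

// Interpolating the result index and the two types avoids synthesizing
// ordinals such as "1th result" in the error message.
LogicalResult checkResultType(Operation *op, unsigned i, Type resultType,
                              Type terminatorType) {
  if (resultType != terminatorType)
    return op->emitOpError()
           << "type mismatch between result " << i << " (" << resultType
           << ") and its definition in the terminator (" << terminatorType
           << ")";
  return success();
}
```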

1114–1119

Can't we just put this part into a custom directive and use declarative assembly for the rest?

1139

Please create the block with the builder; that's why it is provided.
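A minimal sketch of what is being asked for, using OpBuilder::createBlock; the region, argument types, and location below are placeholders:

```cpp
#include "mlir/IR/Builders.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Let the provided builder create and insert the entry block rather than
// allocating a Block manually; this keeps insertion points consistent and
// notifies any builder listeners.
Block *createEntryBlock(OpBuilder &builder, Region &region,
                        TypeRange argTypes, Location loc) {
  SmallVector<Location> argLocs(argTypes.size(), loc);
  return builder.createBlock(&region, region.begin(), argTypes, argLocs);
}
```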

1273–1286

Can't this just return getBody()->without_terminator()?

This revision now requires changes to proceed. May 30 2022, 1:48 AM
nicolasvasilache marked 13 inline comments as done.

Address review comments.

nicolasvasilache marked an inline comment as done. May 30 2022, 8:38 AM
nicolasvasilache added inline comments.
mlir/lib/Dialect/SCF/SCF.cpp
1059

Not actually producible; dropped it.

1114–1119

I could not find a way to handle the "induction variable"-like bbargs in the declarative assembly format. Without that, this would just move almost all of the code into the custom hook without much benefit.

Added a TODO as it would indeed be a nice improvement to use the declarative format.

ftynse accepted this revision. May 30 2022, 8:41 AM
This revision is now accepted and ready to land. May 30 2022, 8:41 AM
This revision was automatically updated to reflect the committed changes.
christopherbate added inline comments.
mlir/lib/Dialect/SCF/SCF.cpp
1272

@nicolasvasilache Currently this operation assumes that there will be a yielded value for each "parallel_insert_slice" operation. Would it make sense to change this so that there is a single result for each unique "dest" value across all the parallel_insert_slice operations? For example, a distributed "copy" operation could have each thread copying non-contiguous portions of the source into a single destination value; currently, that would yield a separate result for each portion.
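A hypothetical sketch of the proposal in C++; the include path, the op name (ParallelInsertSliceOp), and the accessor (getDest) are assumed spellings for illustration, not necessarily those in this patch:

```cpp
#include "mlir/Dialect/SCF/SCF.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Hypothetical helper: compute one result type per *unique* destination
// tensor, rather than one per parallel_insert_slice in the terminator body.
static SmallVector<Type> getDedupedResultTypes(Block &terminatorBody) {
  llvm::SetVector<Value> uniqueDests;
  for (Operation &op : terminatorBody)
    if (auto insert = dyn_cast<scf::ParallelInsertSliceOp>(&op))
      uniqueDests.insert(insert.getDest());
  SmallVector<Type> resultTypes;
  for (Value dest : uniqueDests)
    resultTypes.push_back(dest.getType());
  return resultTypes;
}
```

Under such a scheme, the distributed copy described above (many threads inserting disjoint rows into one destination) would produce a single result for the shared destination.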

mlir/lib/Dialect/SCF/SCF.cpp
1272

This is an interesting take and could make sense, thanks for proposing!
I can certainly see the case of multiple piecewise concurrent updates to the same result.
Could you elaborate a bit on your use cases?

mlir/lib/Dialect/SCF/SCF.cpp
1272

Currently I'm working on an operation that represents a 2D view into the source tensor where each row is contiguous but adjacent rows are not contiguous. A subset of the view is copied row-by-row into a destination tensor. I lower the copy to scf.foreach_thread, where the number of threads is smaller than the number of rows by some factor. At this time the number of threads and the size of the copy are known statically, so each thread should copy N rows