This is an archive of the discontinued LLVM Phabricator instance.

[mlir][OpenMP] Added ReductionClauseInterface
ClosedPublic

Authored by shraiysh on Mar 24 2022, 7:41 AM.

Details

Summary

This patch adds the ReductionClauseInterface and adds reduction
support for the omp.parallel operation.
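
For context, a rough sketch of what this could look like, modeled on the existing omp.wsloop reduction syntax in the dialect tests (names such as @add_f32, @parallel_reduction, and %acc are placeholders, and the exact assembly format may differ from the final patch):

omp.reduction.declare @add_f32 : f32
init {
^bb0(%arg: f32):
  %zero = arith.constant 0.0 : f32
  omp.yield (%zero : f32)
}
combiner {
^bb0(%lhs: f32, %rhs: f32):
  %sum = arith.addf %lhs, %rhs : f32
  omp.yield (%sum : f32)
}

func @parallel_reduction(%acc: !llvm.ptr<f32>) {
  %c1 = arith.constant 1.0 : f32
  // The reduction clause can now appear on omp.parallel as well as on omp.wsloop.
  omp.parallel reduction(@add_f32 -> %acc : !llvm.ptr<f32>) {
    omp.reduction %c1, %acc : !llvm.ptr<f32>
    omp.terminator
  }
  return
}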

Diff Detail

Event Timeline

shraiysh created this revision. · Mar 24 2022, 7:41 AM
Herald added a project: Restricted Project. · View Herald Transcript · Mar 24 2022, 7:41 AM
shraiysh requested review of this revision. · Mar 24 2022, 7:41 AM
shraiysh updated this revision to Diff 417958. · Mar 24 2022, 9:41 AM

Address pre-merge check failure.

Herald added a project: Restricted Project. · View Herald Transcript · Mar 24 2022, 9:41 AM
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

Will this work correctly if there are nested parallel regions but the reduction is on the outer region?

shraiysh added inline comments. · Mar 24 2022, 10:20 AM
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

That is an interesting question. I just translated the functionality from wsloop to the interface. I could not find clear documentation of this being an error.

I tried running a nested parallel region in C without a reduction clause on the inner construct, and it results in a race condition:

#pragma omp parallel reduction(+:i)
  {
    #pragma omp parallel
    {
      for(int j = 0; j < 10000; j++)
        i += 1;
    }
  }

This means that the instruction i += 1 is treated as a normal operation and not as an "omp.reduction" operation. So, to have similar behavior in Fortran, the PFT-to-MLIR lowering should generate a normal operation for i += 1 and not an "omp.reduction" operation. Based on that interpretation, omp.reduction must always be enclosed by an operation with ReductionClauseInterface. Let me know if something seems incorrect.

shraiysh added inline comments. · Mar 24 2022, 10:29 AM
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

omp.reduction must always be enclosed by an operation with ReductionClauseInterface.

To be more precise: omp.reduction must always be enclosed by an operation with ReductionClauseInterface, where the accumulator is part of the innermost such operation's reduction clause.
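
As a hypothetical illustration of this check (placeholder names such as @add_i32 and %acc, with the reduction declaration elided): the first form below would verify, while the second would be rejected because the accumulator passed to omp.reduction is not listed on the enclosing construct's reduction clause.

// Accepted: %acc appears on the reduction clause of the enclosing omp.parallel.
omp.parallel reduction(@add_i32 -> %acc : !llvm.ptr<i32>) {
  omp.reduction %c1, %acc : !llvm.ptr<i32>
  omp.terminator
}

// Rejected: %other is not listed on any enclosing reduction clause.
omp.parallel reduction(@add_i32 -> %acc : !llvm.ptr<i32>) {
  omp.reduction %c1, %other : !llvm.ptr<i32>
  omp.terminator
}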

mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

OK. I am not sure whether the check is stronger than what the standard suggests. Compilers seem to be OK with it. I haven't looked into the standard in detail, but I did not find anything at a quick glance.

The following test seemed to work fine.

int i = 0;
#pragma omp parallel reduction(+:i)
#pragma omp for 
for(int j = 0; j < 10; j++)
  i += 1;
shraiysh added inline comments. · Mar 24 2022, 11:07 AM
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

Yes, I am not suggesting that this is an error. I am saying that in the code above, i += 1 will not be executed atomically (at least that's what happens with clang and gcc for C). Thus, it is not handled as a "reduction". This means that while lowering this code, i += 1 was lowered like normal code (without any reduction-specific atomic handling). So the FIR for this, following clang's semantics for nested constructs, will be something like the following:

omp.parallel reduction( ... ) {
  omp.wsloop for (...) {
    // Using omp.reduction here would be wrong - that would mean an atomic reduction.
    %1 = fir.load %i
    %c1 = arith.constant 1 : i32
    %2 = arith.addi %1, %c1 : i32
    fir.store %2 to %i
  }
}

It is not an error for the frontend - the IR for a reduction clause on the inner construct is simply different. It is the job of the frontend to lower only reductions in the immediate scope as omp.reduction, and the nested ones as normal fir operations. Let me know if that is not clear.
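
For contrast, a rough sketch (same placeholder style as the snippet above) of the case where the reduction clause is also on the inner construct, so the increment in its immediate scope is expressed with omp.reduction:

omp.parallel reduction( ... ) {
  omp.wsloop reduction( ... -> %i : ... ) for (...) {
    // The contribution is in the immediate scope of the wsloop's reduction
    // clause, so it is expressed as omp.reduction rather than load/add/store.
    %c1 = arith.constant 1 : i32
    omp.reduction %c1, %i : ...
    omp.yield
  }
  omp.terminator
}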

mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

Yes, I am not suggesting that this is an error. I am saying that in the code above, i += 1 will not be executed atomically (at least that's what happens with clang and gcc for C). Thus, it is not handled as a "reduction".

The IR I saw seemed to be similar to the case where the reduction is on the #pragma omp for. Intuitively, a reduction on parallel would say that there are private copies for each thread in the parallel region; in each of these threads the addition will happen sequentially, and across threads, finally, at the end of the region, the results will be accumulated atomically. So it seems to be a legitimate case.
Are you suggesting that while this is not an error, we do not have a way to represent this reduction in the current state of reduction handling in the dialect?

shraiysh added inline comments. · Mar 24 2022, 7:20 PM
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

Are you suggesting that while this is not an error, we do not have a way to represent this reduction in the current state of reduction handling in the dialect?

No, we do have a way to represent it. I will try to elaborate.

There are three cases:

  • Reduction on both constructs
omp_set_max_active_levels(2);
#pragma omp parallel reduction(+:i)
{
  #pragma omp parallel for reduction(+:i)
  for(int j = 0; j < 100000; j++)
      i += 1;
}
printf("i = %d\n", i);

On a 32-core machine, this gives i = 3200000, as expected.

  • Reduction on the outer construct only
omp_set_max_active_levels(2);
#pragma omp parallel reduction(+:i)
{
  #pragma omp parallel for
  for(int j = 0; j < 100000; j++)
      i += 1;
}
printf("i = %d\n", i);

This will give a different answer on each run. We have 32x32 threads executing the i += 1 statement. The answers differ because, while the outer construct makes sure that the 32 threads it spawns mutate i atomically, the threads spawned by the inner construct make no such guarantee, and hence the concurrent edits.

  • Reduction on the inner construct only
omp_set_max_active_levels(2);
#pragma omp parallel
{
  #pragma omp parallel for reduction(+:i)
  for(int j = 0; j < 100000; j++)
      i += 1;
}
printf("i = %d\n", i);

Again, this gives a different answer on each run. This is very similar to the second case. Again we have 32x32 threads here. While all the threads generated by the inner construct are atomic with respect to each other (each set of 32 child threads is atomic internally), they are not globally atomic. For ease of understanding, let's say T(m, n) is the instance of the statement i += 1 executed by the n-th inner-construct thread that is a child of the m-th outer-construct thread. With this notation, T(1,1), T(1,2), ..., T(1,32) are atomic with respect to each other, but they are not atomic with T(2,1), and hence the concurrent edits.

The IR I saw seemed to be similar to the case where the reduction is on the #pragma omp for. Intuitively, a reduction on parallel would say that there are private copies for each thread in the parallel region; in each of these threads the addition will happen sequentially, and across threads, finally, at the end of the region, the results will be accumulated atomically. So it seems to be a legitimate case.

Yes, it is a legitimate case, and the IRs are similar, but as I pointed out above, neither of them treats this as a "reduction". Only the first case (a reduction clause on both constructs) treats this as a proper reduction. With the current implementation of omp.reduction, we can have the same semantics as gcc/clang. (I am assuming the gcc and clang behavior to be accurate about nesting, as the standard doesn't mention it.) We can connect over Slack to figure this out if you'd like to. Apologies for the lengthy explanation. Please let me know if anything seems incorrect.

mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

@shraiysh I agree with the nested parallel case because there are two sets of threads. But I was pointing to the case where there is an outer parallel and an inner work-sharing loop.

omp.parallel {
  omp.wsloop {
  }
}

For this case, it seems the reduction can appear on either of the operations and it should not be rejected by the verifier.
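
Concretely, a hypothetical sketch of that shape (placeholder names such as @add_i32 and %acc, reduction declaration elided), where the accumulator is listed on the outer omp.parallel and the omp.reduction sits inside the inner omp.wsloop:

omp.parallel reduction(@add_i32 -> %acc : !llvm.ptr<i32>) {
  omp.wsloop for (...) {
    %c1 = arith.constant 1 : i32
    // %acc is not listed on the wsloop's reduction clause, but it is listed
    // on the enclosing omp.parallel, so the verifier should not reject this.
    omp.reduction %c1, %acc : !llvm.ptr<i32>
    omp.yield
  }
  omp.terminator
}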

shraiysh added inline comments. · Mar 25 2022, 3:35 AM
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
713–714

Hmm okay, now I understand. I had not tried it with just the worksharing construct. In that case, I will relax this requirement to cover all levels of parent constructs. Thanks for the patience and the discussion!

shraiysh updated this revision to Diff 418182. · Mar 25 2022, 4:22 AM

Relaxed the check for accumulator.

shraiysh updated this revision to Diff 418183. · Mar 25 2022, 4:24 AM

Added tests for the new checks. This patch is ready for review now.

shraiysh marked 4 inline comments as done. · Mar 25 2022, 11:31 PM
This revision is now accepted and ready to land. · Mar 27 2022, 4:02 PM
This revision was automatically updated to reflect the committed changes.