This is an archive of the discontinued LLVM Phabricator instance.

elmcdonough retitled this revision from [openmp] Parallel reduction LLVM IR generation to [openmp][mlir] Parallel reduction LLVM IR generation.Jul 12 2023, 11:35 PM

Harbormaster completed remote builds in B244999: Diff 539861.Jul 13 2023, 12:32 AM

Thanks @elmcdonough for the patch.

The reduction for the parallel construct should happen inside the parallel region (outlined function). At the moment, the code generated does this outside it.

Could you also check the case where the reduction happens in a nested region?

      subroutine diff(nelt)
      implicit none

      integer :: ie, nelt, rho1

      r1 = 0
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(ie) REDUCTION(+:r1)
!$OMP DO
       do ie=1,nelt
               r1            = r1 + ie
       end do
!$OMP END DO
!$OMP END PARALLEL

      end

This revision now requires changes to proceed.Jul 13 2023, 6:32 AM

Ensure that reduction happens in outlined function + account for nested WsLoopOp.

Harbormaster completed remote builds in B249603: Diff 546229.Aug 1 2023, 4:03 PM

LG.

Could you check that the parallel reduction tests in the gfortran test suite pass before submitting?
https://github.com/llvm/llvm-test-suite/blob/745f3fdfe3f2f28dcea34e3e55fdc55aca06c00d/Fortran/gfortran/regression/gomp/DisabledFiles.cmake#L111

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
355–368	Nit: Would this change work?

This revision is now accepted and ready to land.Aug 3 2023, 6:26 AM

elmcdonough updated this revision to Diff 550501.Aug 15 2023, 3:16 PM

Harbormaster completed remote builds in B252768: Diff 550501.Aug 15 2023, 4:21 PM

This revision was landed with ongoing or failed builds.Aug 15 2023, 10:59 PM

Closed by commit rGde7224399acd: [openmp][mlir] Parallel reduction LLVM IR generation (authored by elmcdonough). · Explain Why

This revision was automatically updated to reflect the committed changes.

elmcdonough added a commit: rGde7224399acd: [openmp][mlir] Parallel reduction LLVM IR generation.

elmcdonough mentioned this in D158054: [Fortran/gfortran][OpenMP] Enable parallel reduction tests.Aug 15 2023, 11:43 PM

elmcdonough mentioned this in rTdf04fba12583: [Fortran/gfortran][OpenMP] Enable parallel reduction tests.Aug 16 2023, 9:11 AM

Revision Contents

Path

Size

mlir/

include/

mlir/

Dialect/

OpenMP/

OpenMPOps.td

4 lines

lib/

Target/

LLVMIR/

Dialect/

OpenMP/

OpenMPToLLVMIRTranslation.cpp

348 lines

test/

Target/

LLVMIR/

openmp-reduction.mlir

143 lines

Diff 550619

mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td

Show First 20 Lines • Show All 182 Lines • ▼ Show 20 Lines	let arguments = (ins Optional<I1>:$if_expr_var,
OptionalAttr<SymbolRefArrayAttr>:$reductions,		OptionalAttr<SymbolRefArrayAttr>:$reductions,
OptionalAttr<ProcBindKindAttr>:$proc_bind_val);		OptionalAttr<ProcBindKindAttr>:$proc_bind_val);

let regions = (region AnyRegion:$region);		let regions = (region AnyRegion:$region);

let builders = [		let builders = [
OpBuilder<(ins CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>		OpBuilder<(ins CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>
];		];
		let extraClassDeclaration = [{
		/// Returns the number of reduction variables.
		unsigned getNumReductionVars() { return getReductionVars().size(); }
		}];
let assemblyFormat = [{		let assemblyFormat = [{
oilist( `reduction` `(`		oilist( `reduction` `(`
custom<ReductionVarList>(		custom<ReductionVarList>(
$reduction_vars, type($reduction_vars), $reductions		$reduction_vars, type($reduction_vars), $reductions
) `)`		) `)`
\| `if` `(` $if_expr_var `:` type($if_expr_var) `)`		\| `if` `(` $if_expr_var `:` type($if_expr_var) `)`
\| `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)`		\| `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)`
\| `allocate` `(`		\| `allocate` `(`
▲ Show 20 Lines • Show All 1,589 Lines • Show Last 20 Lines

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Show First 20 Lines • Show All 247 Lines • ▼ Show 20 Lines

static llvm::omp::ProcBindKind getProcBindKind(omp::ClauseProcBindKind kind) {

case omp::ClauseProcBindKind::Primary:

return llvm::omp::ProcBindKind::OMP_PROC_BIND_primary;

case omp::ClauseProcBindKind::Spread:

return llvm::omp::ProcBindKind::OMP_PROC_BIND_spread;

}

llvm_unreachable("Unknown ClauseProcBindKind kind");

}

/// Lowers an `omp.parallel` operation to LLVM IR through OpenMPIRBuilder.
///
/// The single region of the operation is translated from inside the body
/// callback handed to `createParallel`. The optional `if`, `num_threads` and
/// `proc_bind` operands are looked up / converted beforehand and forwarded.
/// Returns the status recorded while translating the region.
static LogicalResult
convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                   LLVM::ModuleTranslation &moduleTranslation) {
  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;

  // TODO: support error propagation in OpenMPIRBuilder and use it instead of
  // relying on captured variables.
  LogicalResult regionStatus = success();

  // Body callback: translate the (only) region of the parallel op at the
  // code-generation insertion point supplied by the builder.
  auto emitBody = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
    // Keep the alloca insertion point on the ModuleTranslation stack while
    // the region is translated so that nested regions can use it.
    LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> allocaFrame(
        moduleTranslation, allocaIP);

    builder.restoreIP(codeGenIP);
    convertOmpOpRegions(opInst.getRegion(), "omp.par.region", builder,
                        moduleTranslation, regionStatus);
  };

  // TODO: Perform appropriate actions according to the data-sharing
  // attribute (shared, private, firstprivate, ...) of variables.
  // Currently defaults to shared: the original pointer is used as-is.
  auto keepShared = [](InsertPointTy, InsertPointTy codeGenIP, llvm::Value &,
                       llvm::Value &vPtr,
                       llvm::Value *&replacementValue) -> InsertPointTy {
    replacementValue = &vPtr;
    return codeGenIP;
  };

  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto noFinalization = [](InsertPointTy) {};

  // Optional `if` clause condition.
  auto ifExprVar = opInst.getIfExprVar();
  llvm::Value *ifCond =
      ifExprVar ? moduleTranslation.lookupValue(ifExprVar) : nullptr;

  // Optional `num_threads` clause value.
  auto numThreadsVar = opInst.getNumThreadsVar();
  llvm::Value *numThreads =
      numThreadsVar ? moduleTranslation.lookupValue(numThreadsVar) : nullptr;

  // Optional `proc_bind` clause; falls back to the default binding.
  auto procBind = opInst.getProcBindVal();
  auto pbKind = procBind ? getProcBindKind(*procBind)
                         : llvm::omp::OMP_PROC_BIND_default;

  // TODO: Is the Parallel construct cancellable?
  const bool isCancellable = false;

  InsertPointTy outerAllocaIP =
      findAllocaInsertPoint(builder, moduleTranslation);
  llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);

  // Continue emitting code at the insertion point returned by the builder.
  InsertPointTy afterParallelIP =
      moduleTranslation.getOpenMPBuilder()->createParallel(
          ompLoc, outerAllocaIP, emitBody, keepShared, noFinalization, ifCond,
          numThreads, pbKind, isCancellable);
  builder.restoreIP(afterParallelIP);

  return regionStatus;
}

/// Converts an OpenMP 'master' operation into LLVM IR using OpenMPIRBuilder.

static LogicalResult

convertOmpMaster(Operation &opInst, llvm::IRBuilderBase &builder,

LLVM::ModuleTranslation &moduleTranslation) {

using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;

// TODO: support error propagation in OpenMPIRBuilder and use it instead of

// relying on captured variables.

LogicalResult bodyGenStatus = success();

▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines

convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,

}

builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createCritical(

ompLoc, bodyGenCB, finiCB, criticalOp.getName().value_or(""), hint));

return success();

}

/// Returns a reduction declaration that corresponds to the given reduction

/// operation in the given container. Currently only supports reductions inside

/// WsLoopOp but can be easily extended.

/// WsLoopOp and ParallelOp but can be easily extended as long as the given

static omp::ReductionDeclareOp findReductionDecl(omp::WsLoopOp container,

/// construct implements getNumReductionVars.

omp::ReductionOp reduction) {

template <typename T>

SymbolRefAttr reductionSymbol;

static std::optional<omp::ReductionDeclareOp>

findReductionDeclInContainer(T container, omp::ReductionOp reduction) {

for (unsigned i = 0, e = container.getNumReductionVars(); i < e; ++i) {

if (container.getReductionVars()[i] != reduction.getAccumulator())

continue;

reductionSymbol = cast<SymbolRefAttr>((*container.getReductions())[i]);

SymbolRefAttr reductionSymbol =

cast<SymbolRefAttr>((*container.getReductions())[i]);

auto declareOp =

SymbolTable::lookupNearestSymbolFrom<omp::ReductionDeclareOp>(

container, reductionSymbol);

return declareOp;

}

return std::nullopt;

}

/// Searches for a reduction in a provided region and the regions

/// it is nested in

static omp::ReductionDeclareOp findReductionDecl(Operation &containerOp,

omp::ReductionOp reduction) {

std::optional<omp::ReductionDeclareOp> declareOp = std::nullopt;

Operation *container = &containerOp;

while (!declareOp.has_value() && container) {

// Check if current container is supported for reductions searches

if (auto par = dyn_cast<omp::ParallelOp>(*container)) {

declareOp = findReductionDeclInContainer(par, reduction);

} else if (auto loop = dyn_cast<omp::WsLoopOp>(*container)) {

declareOp = findReductionDeclInContainer(loop, reduction);

} else {

break;

}

assert(reductionSymbol &&

// See if we can search parent for reductions as well

container = containerOp.getParentOp();

}

assert(declareOp.has_value() &&

kiranchandramohanUnsubmitted

Not Done

while (!declareOp.has_value() && container) {

// Check if current container is supported for reductions searches

if (auto par = dyn_cast<omp::ParallelOp>(*container)) {

declareOp = findReductionDeclInContainer(par, reduction);

} else if (auto loop = dyn_cast<omp::WsLoopOp>(*container)) {

declareOp = findReductionDeclInContainer(loop, reduction);

- }

- // See if we can search parent for reductions as well

- Operation *parent = containerOp.getParentOp();

- if (dyn_cast<omp::ParallelOp>(parent) || dyn_cast<omp::WsLoopOp>(parent)) {

- container = parent;

} else {

break;

}

+ // See if we can search parent for reductions as well

+ container = containerOp.getParentOp();

}

assert(declareOp.has_value() &&

Nit: Would this change work?

kiranchandramohan: Nit: Would this change work?

"reduction operation must be associated with a declaration");

return SymbolTable::lookupNearestSymbolFrom<omp::ReductionDeclareOp>(

return *declareOp;

container, reductionSymbol);

}

/// Populates `reductions` with reduction declarations used in the given loop.

template <typename T>

static void

collectReductionDecls(omp::WsLoopOp loop,

collectReductionDecls(T loop,

SmallVectorImpl<omp::ReductionDeclareOp> &reductions) {

std::optional<ArrayAttr> attr = loop.getReductions();

if (!attr)

return;

reductions.reserve(reductions.size() + loop.getNumReductionVars());

for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {

reductions.push_back(

▲ Show 20 Lines • Show All 341 Lines • ▼ Show 20 Lines

convertOmpTaskgroupOp(omp::TaskGroupOp tgOp, llvm::IRBuilderBase &builder,

};

InsertPointTy allocaIP = findAllocaInsertPoint(builder, moduleTranslation);

llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);

builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTaskgroup(

ompLoc, allocaIP, bodyCB));

return bodyGenStatus;

}

/// Allocate space for privatized reduction variables.

template <typename T>

static void

allocReductionVars(T loop, llvm::IRBuilderBase &builder,

LLVM::ModuleTranslation &moduleTranslation,

llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,

SmallVector<omp::ReductionDeclareOp> &reductionDecls,

SmallVector<llvm::Value *> &privateReductionVariables,

DenseMap<Value, llvm::Value *> &reductionVariableMap) {

unsigned numReductions = loop.getNumReductionVars();

privateReductionVariables.reserve(numReductions);

if (numReductions != 0) {

llvm::IRBuilderBase::InsertPointGuard guard(builder);

builder.restoreIP(allocaIP);

for (unsigned i = 0; i < numReductions; ++i) {

llvm::Value *var = builder.CreateAlloca(

moduleTranslation.convertType(reductionDecls[i].getType()));

privateReductionVariables.push_back(var);

reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);

}

/// Collect reduction info

template <typename T>

static void collectReductionInfo(

T loop, llvm::IRBuilderBase &builder,

LLVM::ModuleTranslation &moduleTranslation,

SmallVector<omp::ReductionDeclareOp> &reductionDecls,

SmallVector<OwningReductionGen> &owningReductionGens,

SmallVector<OwningAtomicReductionGen> &owningAtomicReductionGens,

const SmallVector<llvm::Value *> &privateReductionVariables,

SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos) {

unsigned numReductions = loop.getNumReductionVars();

for (unsigned i = 0; i < numReductions; ++i) {

owningReductionGens.push_back(

makeReductionGen(reductionDecls[i], builder, moduleTranslation));

owningAtomicReductionGens.push_back(

makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation));

}

// Collect the reduction information.

reductionInfos.reserve(numReductions);

for (unsigned i = 0; i < numReductions; ++i) {

llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr;

if (owningAtomicReductionGens[i])

atomicGen = owningAtomicReductionGens[i];

llvm::Value *variable =

moduleTranslation.lookupValue(loop.getReductionVars()[i]);

reductionInfos.push_back(

{moduleTranslation.convertType(reductionDecls[i].getType()), variable,

privateReductionVariables[i], owningReductionGens[i], atomicGen});

}

/// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder.

static LogicalResult

convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,

LLVM::ModuleTranslation &moduleTranslation) {

auto loop = cast<omp::WsLoopOp>(opInst);

// TODO: this should be in the op verifier instead.

if (loop.getLowerBound().empty())

return failure();

Show All 12 Lines

if (loop.getScheduleChunkVar()) {

chunk = builder.CreateSExtOrTrunc(chunkVar, ivType);

}

SmallVector<omp::ReductionDeclareOp> reductionDecls;

collectReductionDecls(loop, reductionDecls);

llvm::OpenMPIRBuilder::InsertPointTy allocaIP =

findAllocaInsertPoint(builder, moduleTranslation);

// Allocate space for privatized reduction variables.

SmallVector<llvm::Value *> privateReductionVariables;

DenseMap<Value, llvm::Value *> reductionVariableMap;

unsigned numReductions = loop.getNumReductionVars();

allocReductionVars(loop, builder, moduleTranslation, allocaIP, reductionDecls,

privateReductionVariables.reserve(numReductions);

privateReductionVariables, reductionVariableMap);

if (numReductions != 0) {

llvm::IRBuilderBase::InsertPointGuard guard(builder);

builder.restoreIP(allocaIP);

for (unsigned i = 0; i < numReductions; ++i) {

llvm::Value *var = builder.CreateAlloca(

moduleTranslation.convertType(reductionDecls[i].getType()));

privateReductionVariables.push_back(var);

reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);

}

// Store the mapping between reduction variables and their private copies on

// ModuleTranslation stack. It can be then recovered when translating

// omp.reduce operations in a separate call.

LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(

moduleTranslation, reductionVariableMap);

// Before the loop, store the initial values of reductions into reduction

// variables. Although this could be done after allocas, we don't want to mess

// up with the alloca insertion point.

for (unsigned i = 0; i < numReductions; ++i) {

for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) {

SmallVector<llvm::Value *> phis;

if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),

"omp.reduction.neutral", builder,

moduleTranslation, &phis)))

return failure();

assert(phis.size() == 1 && "expected one value to be yielded from the "

"reduction neutral element declaration region");

builder.CreateStore(phis[0], privateReductionVariables[i]);

▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines

convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,

// Continue building IR after the loop. Note that the LoopInfo returned by

// `collapseLoops` points inside the outermost loop and is intended for

// potential further loop transformations. Use the insertion point stored

// before collapsing loops instead.

builder.restoreIP(afterIP);

// Process the reductions if required.

if (numReductions == 0)

if (loop.getNumReductionVars() == 0)

return success();

// Create the reduction generators. We need to own them here because

// ReductionInfo only accepts references to the generators.

SmallVector<OwningReductionGen> owningReductionGens;

SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;

for (unsigned i = 0; i < numReductions; ++i) {

owningReductionGens.push_back(

makeReductionGen(reductionDecls[i], builder, moduleTranslation));

owningAtomicReductionGens.push_back(

makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation));

}

// Collect the reduction information.

SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;

reductionInfos.reserve(numReductions);

collectReductionInfo(loop, builder, moduleTranslation, reductionDecls,

for (unsigned i = 0; i < numReductions; ++i) {

owningReductionGens, owningAtomicReductionGens,

llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr;

privateReductionVariables, reductionInfos);

if (owningAtomicReductionGens[i])

atomicGen = owningAtomicReductionGens[i];

llvm::Value *variable =

moduleTranslation.lookupValue(loop.getReductionVars()[i]);

reductionInfos.push_back(

{moduleTranslation.convertType(reductionDecls[i].getType()), variable,

privateReductionVariables[i], owningReductionGens[i], atomicGen});

}

// The call to createReductions below expects the block to have a

// terminator. Create an unreachable instruction to serve as terminator

// and remove it later.

llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable();

builder.SetInsertPoint(tempTerminator);

llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =

ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,

loop.getNowait());

if (!contInsertPoint.getBlock())

return loop->emitOpError() << "failed to convert reductions";

auto nextInsertionPoint =

ompBuilder->createBarrier(contInsertPoint, llvm::omp::OMPD_for);

tempTerminator->eraseFromParent();

builder.restoreIP(nextInsertionPoint);

return success();

}

/// Converts the OpenMP parallel operation to LLVM IR.
///
/// All reduction handling is emitted from inside the body callback passed to
/// `createParallel`: private copies of the reduction variables are allocated
/// and initialised from the declarations' initializer regions, the region of
/// the operation is translated, and finally the reductions are combined via
/// `createReductions`.
///
/// Returns failure if translating an initializer region, the body region, or
/// the reductions fails; the status is communicated out of the callback
/// through the captured `bodyGenStatus`.
static LogicalResult
convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                   LLVM::ModuleTranslation &moduleTranslation) {
  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
  // TODO: support error propagation in OpenMPIRBuilder and use it instead of
  // relying on captured variables.
  LogicalResult bodyGenStatus = success();
  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();

  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
    // Collect reduction declarations
    SmallVector<omp::ReductionDeclareOp> reductionDecls;
    collectReductionDecls(opInst, reductionDecls);

    // Allocate reduction vars
    SmallVector<llvm::Value *> privateReductionVariables;
    DenseMap<Value, llvm::Value *> reductionVariableMap;
    allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
                       reductionDecls, privateReductionVariables,
                       reductionVariableMap);

    // Store the mapping between reduction variables and their private copies on
    // ModuleTranslation stack. It can be then recovered when translating
    // omp.reduce operations in a separate call.
    LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
        moduleTranslation, reductionVariableMap);

    // Initialize reduction vars
    builder.restoreIP(allocaIP);
    for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
      SmallVector<llvm::Value *> phis;
      if (failed(inlineConvertOmpRegions(
              reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral",
              builder, moduleTranslation, &phis))) {
        // Record the failure and stop here: continuing would read `phis[0]`
        // below even though the initializer was not translated, and the
        // assert guarding that read is compiled out in release builds.
        bodyGenStatus = failure();
        return;
      }
      assert(phis.size() == 1 &&
             "expected one value to be yielded from the "
             "reduction neutral element declaration region");
      // Translating the initializer may have moved the builder; go back to
      // the alloca insertion point before storing the neutral element.
      builder.restoreIP(allocaIP);
      builder.CreateStore(phis[0], privateReductionVariables[i]);
    }

    // Save the alloca insertion point on ModuleTranslation stack for use in
    // nested regions.
    LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
        moduleTranslation, allocaIP);

    // ParallelOp has only one region associated with it.
    builder.restoreIP(codeGenIP);
    auto regionBlock =
        convertOmpOpRegions(opInst.getRegion(), "omp.par.region", builder,
                            moduleTranslation, bodyGenStatus);

    // Process the reductions if required.
    if (opInst.getNumReductionVars() > 0) {
      // Collect reduction info
      SmallVector<OwningReductionGen> owningReductionGens;
      SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
      SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
      collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls,
                           owningReductionGens, owningAtomicReductionGens,
                           privateReductionVariables, reductionInfos);

      // Move to region cont block
      builder.SetInsertPoint(regionBlock->getTerminator());

      // Generate reductions from info. A temporary unreachable instruction is
      // inserted to serve as the insertion anchor for createReductions and is
      // removed again once the reductions have been emitted.
      llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable();
      builder.SetInsertPoint(tempTerminator);

      // The final argument is the position where the worksharing-loop
      // lowering passes its `nowait` flag; it is always false here.
      llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
          ompBuilder->createReductions(builder.saveIP(), allocaIP,
                                       reductionInfos, false);
      if (!contInsertPoint.getBlock()) {
        bodyGenStatus = opInst->emitOpError() << "failed to convert reductions";
        return;
      }

      tempTerminator->eraseFromParent();
      builder.restoreIP(contInsertPoint);
    }
  };

  // TODO: Perform appropriate actions according to the data-sharing
  // attribute (shared, private, firstprivate, ...) of variables.
  // Currently defaults to shared.
  auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
                    llvm::Value &, llvm::Value &vPtr,
                    llvm::Value *&replacementValue) -> InsertPointTy {
    replacementValue = &vPtr;

    return codeGenIP;
  };

  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto finiCB = [&](InsertPointTy codeGenIP) {};

  // Optional `if` clause condition.
  llvm::Value *ifCond = nullptr;
  if (auto ifExprVar = opInst.getIfExprVar())
    ifCond = moduleTranslation.lookupValue(ifExprVar);

  // Optional `num_threads` clause value.
  llvm::Value *numThreads = nullptr;
  if (auto numThreadsVar = opInst.getNumThreadsVar())
    numThreads = moduleTranslation.lookupValue(numThreadsVar);

  // Optional `proc_bind` clause; the default binding is used when absent.
  auto pbKind = llvm::omp::OMP_PROC_BIND_default;
  if (auto bind = opInst.getProcBindVal())
    pbKind = getProcBindKind(*bind);

  // TODO: Is the Parallel construct cancellable?
  bool isCancellable = false;

  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
      findAllocaInsertPoint(builder, moduleTranslation);
  llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);

  builder.restoreIP(
      ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB,
                                 ifCond, numThreads, pbKind, isCancellable));

  return bodyGenStatus;
}

/// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder.

static LogicalResult

convertOmpSimdLoop(Operation &opInst, llvm::IRBuilderBase &builder,

LLVM::ModuleTranslation &moduleTranslation) {

auto loop = cast<omp::SimdLoopOp>(opInst);

llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);

▲ Show 20 Lines • Show All 316 Lines • ▼ Show 20 Lines

builder.restoreIP(ompBuilder->createAtomicCapture(

ompLoc, allocaIP, llvmAtomicX, llvmAtomicV, llvmExpr, atomicOrdering,

binop, updateFn, atomicUpdateOp, isPostfixUpdate, isXBinopExpr));

return updateGenStatus;

}

/// Converts an OpenMP reduction operation using OpenMPIRBuilder. Expects the

/// mapping between reduction variables and their private equivalents to have

/// been stored on the ModuleTranslation stack. Currently only supports

/// reduction within WsLoopOp, but can be easily extended.

/// reduction within WsLoopOp and ParallelOp, but can be easily extended.

static LogicalResult

convertOmpReductionOp(omp::ReductionOp reductionOp,

llvm::IRBuilderBase &builder,

LLVM::ModuleTranslation &moduleTranslation) {

// Find the declaration that corresponds to the reduction op.

auto reductionContainer = reductionOp->getParentOfType<omp::WsLoopOp>();

omp::ReductionDeclareOp declaration;

omp::ReductionDeclareOp declaration =

Operation *reductionParent = reductionOp->getParentOp();

findReductionDecl(reductionContainer, reductionOp);

if (dyn_cast<omp::ParallelOp>(reductionParent) ||

dyn_cast<omp::WsLoopOp>(reductionParent)) {

declaration = findReductionDecl(*reductionParent, reductionOp);

} else {

llvm_unreachable("Unhandled reduction container");

}

assert(declaration && "could not find reduction declaration");

// Retrieve the mapping between reduction variables and their private

// equivalents.

const DenseMap<Value, llvm::Value *> *reductionVariableMap = nullptr;

moduleTranslation.stackWalk<OpenMPVarMappingStackFrame>(

[&](const OpenMPVarMappingStackFrame &frame) {

if (frame.mapping.contains(reductionOp.getAccumulator())) {

reductionVariableMap = &frame.mapping;

return WalkResult::interrupt();

}

return WalkResult::advance();

});

assert(reductionVariableMap && "couldn't find private reduction variables");

// Translate the reduction operation by emitting the body of the corresponding

// reduction declaration.

Region &reductionRegion = declaration.getReductionRegion();

llvm::Value *privateReductionVar =

reductionVariableMap->lookup(reductionOp.getAccumulator());

llvm::Value *reductionVal = builder.CreateLoad(

moduleTranslation.convertType(reductionOp.getOperand().getType()),

privateReductionVar);

▲ Show 20 Lines • Show All 658 Lines • Show Last 20 Lines

mlir/test/Target/LLVMIR/openmp-reduction.mlir

	Show First 20 Lines • Show All 410 Lines • ▼ Show 20 Lines
	// CHECK: %[[PARTIAL2:.+]] = load float, ptr %[[PRIVATE2]]			// CHECK: %[[PARTIAL2:.+]] = load float, ptr %[[PRIVATE2]]
	// CHECK: %[[UPDATED2:.+]] = fmul float %[[PARTIAL2]], 2.000000e+00			// CHECK: %[[UPDATED2:.+]] = fmul float %[[PARTIAL2]], 2.000000e+00
	// CHECK: store float %[[UPDATED2]], ptr %[[PRIVATE2]]			// CHECK: store float %[[UPDATED2]], ptr %[[PRIVATE2]]

	// Reduction function.			// Reduction function.
	// CHECK: define internal void @[[REDFUNC]]			// CHECK: define internal void @[[REDFUNC]]
	// CHECK: fadd float			// CHECK: fadd float
	// CHECK: fmul float			// CHECK: fmul float

				// -----

				// Reduction declaration for f32 addition, referenced as @add_f32 by the
				// test function that follows.
				omp.reduction.declare @add_f32 : f32
				// Initializer region: yields the constant 0.0.
				init {
				^bb0(%arg: f32):
				%0 = llvm.mlir.constant(0.0 : f32) : f32
				omp.yield (%0 : f32)
				}
				// Combiner region: yields the floating-point sum of its two arguments.
				combiner {
				^bb1(%arg0: f32, %arg1: f32):
				%1 = llvm.fadd %arg0, %arg1 : f32
				omp.yield (%1 : f32)
				}
				// Atomic region: loads an f32 through the second pointer and adds it
				// to the location behind the first pointer with a monotonic atomicrmw.
				atomic {
				^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr):
				%2 = llvm.load %arg3 : !llvm.ptr -> f32
				llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
				omp.yield
				}

				// CHECK-LABEL: @simple_reduction_parallel
				// Reduction clause placed directly on omp.parallel, with the
				// omp.reduction update sitting directly in the parallel region.
				llvm.func @simple_reduction_parallel() {
				%c1 = llvm.mlir.constant(1 : i32) : i32
				// Storage for the accumulator; its element type matches the f32
				// type of the @add_f32 reduction declaration.
				%0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
				omp.parallel reduction(@add_f32 -> %0 : !llvm.ptr) {
				%1 = llvm.mlir.constant(2.0 : f32) : f32
				omp.reduction %1, %0 : f32, !llvm.ptr
				omp.terminator
				}
				llvm.return
				}

				// Call to the outlined function.
				// CHECK: call void {{.*}} @__kmpc_fork_call
				// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]

				// Outlined function.
				// CHECK: define internal void @[[OUTLINED]]

				// Private reduction variable and its initialization.
				// CHECK: %[[PRIVATE:.+]] = alloca float
				// CHECK: store float 0.000000e+00, ptr %[[PRIVATE]]

				// Update of the private variable
				// CHECK: %[[PARTIAL:.+]] = load float, ptr %[[PRIVATE]]
				// CHECK: %[[UPDATED:.+]] = fadd float %[[PARTIAL]], 2.000000e+00
				// CHECK: store float %[[UPDATED]], ptr %[[PRIVATE]]

				// Call to the reduction function.
				// CHECK: call i32 @__kmpc_reduce
				// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]

				// Atomic reduction.
				// CHECK: %[[PARTIAL:.+]] = load float, ptr %[[PRIVATE]]
				// CHECK: atomicrmw fadd ptr %{{.*}}, float %[[PARTIAL]]

				// Non-atomic reduction:
				// CHECK: fadd float
				// CHECK: call void @__kmpc_end_reduce
				// CHECK: br label %[[FINALIZE:.+]]

				// CHECK: [[FINALIZE]]:

				// Reduction function.
				// CHECK: define internal void @[[REDFUNC]]
				// CHECK: fadd float

				// -----

				// Reduction declaration for i32 addition, referenced as @add_i32 by the
				// test function that follows.
				omp.reduction.declare @add_i32 : i32
				// Initializer region: yields the constant 0.
				init {
				^bb0(%arg: i32):
				%0 = llvm.mlir.constant(0 : i32) : i32
				omp.yield (%0 : i32)
				}
				// Combiner region: yields the integer sum of its two arguments.
				combiner {
				^bb1(%arg0: i32, %arg1: i32):
				%1 = llvm.add %arg0, %arg1 : i32
				omp.yield (%1 : i32)
				}
				// Atomic region: loads an i32 through the second pointer and adds it
				// to the location behind the first pointer with a monotonic atomicrmw.
				atomic {
				^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr):
				%2 = llvm.load %arg3 : !llvm.ptr -> i32
				llvm.atomicrmw add %arg2, %2 monotonic : !llvm.ptr, i32
				omp.yield
				}

				// CHECK-LABEL: @parallel_nested_workshare_reduction
				// The reduction clause is attached to omp.parallel, while the
				// omp.reduction update sits inside a nested omp.wsloop that carries no
				// reduction clause of its own.
				llvm.func @parallel_nested_workshare_reduction(%ub : i64) {
				%c1 = llvm.mlir.constant(1 : i32) : i32
				// Storage for the i32 accumulator named in the reduction clause.
				%0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr

				// Loop lower bound and step; the upper bound is the function argument.
				%lb = llvm.mlir.constant(1 : i64) : i64
				%step = llvm.mlir.constant(1 : i64) : i64

				omp.parallel reduction(@add_i32 -> %0 : !llvm.ptr) {
				omp.wsloop for (%iv) : i64 = (%lb) to (%ub) step (%step) {
				// Each iteration contributes its truncated induction variable.
				%ival = llvm.trunc %iv : i64 to i32
				omp.reduction %ival, %0 : i32, !llvm.ptr
				omp.yield
				}
				omp.terminator
				}

				llvm.return
				}

				// Call to the outlined function.
				// CHECK: call void {{.*}} @__kmpc_fork_call
				// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]

				// Outlined function.
				// CHECK: define internal void @[[OUTLINED]]

				// Private reduction variable and its initialization.
				// CHECK: %[[PRIVATE:[0-9]+]] = alloca i32
				// CHECK: store i32 0, ptr %[[PRIVATE]]

				// Loop exit:
				// CHECK: call void @__kmpc_barrier

				// Call to the reduction function.
				// CHECK: call i32 @__kmpc_reduce
				// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]

				// Atomic reduction:
				// CHECK: %[[PARTIAL:.+]] = load i32, ptr %[[PRIVATE]]
				// CHECK: atomicrmw add ptr %{{.*}}, i32 %[[PARTIAL]]

				// Non-atomic reduction:
				// CHECK: add i32
				// CHECK: call void @__kmpc_end_reduce

				// Update of the private variable using the reduction region
				// (the body block currently comes after all the other blocks).
				// CHECK: %[[PARTIAL:.+]] = load i32, ptr %[[PRIVATE]]
				// CHECK: %[[UPDATED:.+]] = add i32 %[[PARTIAL]], {{.*}}
				// CHECK: store i32 %[[UPDATED]], ptr %[[PRIVATE]]

				// Reduction function.
				// CHECK: define internal void @[[REDFUNC]]
				// CHECK: add i32

This is an archive of the discontinued LLVM Phabricator instance.

[openmp][mlir] Parallel reduction LLVM IR generationClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 550619

mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

mlir/test/Target/LLVMIR/openmp-reduction.mlir

[openmp][mlir] Parallel reduction LLVM IR generation
ClosedPublic