diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -83,6 +83,9 @@ return valueMapping.lookup(value); } + /// Looks up a list of remapped values. + SmallVector lookupValues(ValueRange values); + /// Stores the mapping between an MLIR block and LLVM IR basic block. void mapBlock(Block *mlir, llvm::BasicBlock *llvm) { auto result = blockMapping.try_emplace(mlir, llvm); @@ -110,6 +113,10 @@ return branchMapping.lookup(op); } + /// Removes the mapping for blocks contained in the region and values defined + /// in these blocks. Does _not_ recur into nested regions. + void forgetMapping(Region &region); + /// Returns the LLVM metadata corresponding to a reference to an mlir LLVM /// dialect access group operation. llvm::MDNode *getAccessGroup(Operation &opInst, @@ -134,9 +141,6 @@ /// Converts the type from MLIR LLVM dialect to LLVM. llvm::Type *convertType(Type type); - /// Looks up remapped a list of remapped values. - SmallVector lookupValues(ValueRange values); - /// Returns the MLIR context of the module being translated. MLIRContext &getContext() { return *mlirModule->getContext(); } @@ -168,7 +172,8 @@ /// translate the IR, leaving it at the end of the block. If `ignoreArguments` /// is set, does not produce PHI nodes for the block arguments. Otherwise, the /// PHI nodes are constructed for block arguments but are _not_ connected to - /// the predecessors that may not exist yet. + /// the predecessors that may not exist yet. If `convertOp` is provided, use + /// it to translate operations in the block; otherwise use `convertOperation`. 
LogicalResult convertBlock(Block &bb, bool ignoreArguments, llvm::IRBuilderBase &builder); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Operation.h" #include "mlir/Support/LLVM.h" #include "mlir/Target/LLVMIR/ModuleTranslation.h" @@ -33,6 +34,19 @@ : allocaInsertPoint(allocaIP) {} llvm::OpenMPIRBuilder::InsertPointTy allocaInsertPoint; }; + +/// ModuleTranslation stack frame containing the partial mapping between MLIR +/// values and their LLVM IR equivalents. +class OpenMPVarMappingStackFrame + : public LLVM::ModuleTranslation::StackFrameBase< + OpenMPVarMappingStackFrame> { +public: + explicit OpenMPVarMappingStackFrame( + const DenseMap &mapping) + : mapping(mapping) {} + + DenseMap mapping; +}; } // namespace /// Find the insertion point for allocas given the current insertion point for @@ -62,22 +76,66 @@ /// Converts the given region that appears within an OpenMP dialect operation to /// LLVM IR, creating a branch from the `sourceBlock` to the entry block of the /// region, and a branch from any block with an successor-less OpenMP terminator -/// to `continuationBlock`. -static void convertOmpOpRegions(Region &region, StringRef blockName, - llvm::BasicBlock &sourceBlock, - llvm::BasicBlock &continuationBlock, - llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, - LogicalResult &bodyGenStatus) { +/// to `continuationBlock`. 
Populates `continuationBlockPHIs` with the PHI nodes +/// of the continuation block if provided. +static void convertOmpOpRegions( + Region &region, StringRef blockName, llvm::BasicBlock &sourceBlock, + llvm::BasicBlock &continuationBlock, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, LogicalResult &bodyGenStatus, + SmallVectorImpl *continuationBlockPHIs = nullptr) { llvm::LLVMContext &llvmContext = builder.getContext(); for (Block &bb : region) { llvm::BasicBlock *llvmBB = llvm::BasicBlock::Create( - llvmContext, blockName, builder.GetInsertBlock()->getParent()); + llvmContext, blockName, builder.GetInsertBlock()->getParent(), + builder.GetInsertBlock()->getNextNode()); moduleTranslation.mapBlock(&bb, llvmBB); } llvm::Instruction *sourceTerminator = sourceBlock.getTerminator(); + // Terminators (namely YieldOp) may be forwarding values to the region that + // need to be available in the continuation block. Collect the types of these + // operands in preparation of creating PHI nodes. 
+ SmallVector continuationBlockPHITypes; + bool operandsProcessed = false; + unsigned numYields = 0; + for (Block &bb : region.getBlocks()) { + if (omp::YieldOp yield = dyn_cast(bb.getTerminator())) { + if (!operandsProcessed) { + for (unsigned i = 0, e = yield->getNumOperands(); i < e; ++i) { + continuationBlockPHITypes.push_back( + moduleTranslation.convertType(yield->getOperand(i).getType())); + } + operandsProcessed = true; + } else { + assert(continuationBlockPHITypes.size() == yield->getNumOperands() && + "mismatching number of values yielded from the region"); + for (unsigned i = 0, e = yield->getNumOperands(); i < e; ++i) { + llvm::Type *operandType = + moduleTranslation.convertType(yield->getOperand(i).getType()); + (void)operandType; + assert(continuationBlockPHITypes[i] == operandType && + "values of mismatching types yielded from the region"); + } + } + numYields++; + } + } + + // Insert PHI nodes in the continuation block for any values forwarded by the + // terminators in this region. + if (!continuationBlockPHITypes.empty()) + assert( + continuationBlockPHIs && + "expected continuation block PHIs if converted regions yield values"); + if (continuationBlockPHIs) { + llvm::IRBuilderBase::InsertPointGuard guard(builder); + continuationBlockPHIs->reserve(continuationBlockPHITypes.size()); + builder.SetInsertPoint(&continuationBlock, continuationBlock.begin()); + for (llvm::Type *ty : continuationBlockPHITypes) + continuationBlockPHIs->push_back(builder.CreatePHI(ty, numYields)); + } + // Convert blocks one by one in topological order to ensure // defs are converted before uses. SetVector blocks = @@ -108,12 +166,24 @@ // ModuleTranslation class to set up the correct insertion point. This is // also consistent with MLIR's idiom of handling special region terminators // in the same code that handles the region-owning operation. 
- if (isa(bb->getTerminator())) + Operation *terminator = bb->getTerminator(); + if (isa(terminator)) { builder.CreateBr(&continuationBlock); + + for (unsigned i = 0, e = terminator->getNumOperands(); i < e; ++i) + (*continuationBlockPHIs)[i]->addIncoming( + moduleTranslation.lookupValue(terminator->getOperand(i)), llvmBB); + } } - // Finally, after all blocks have been traversed and values mapped, - // connect the PHI nodes to the results of preceding blocks. + // After all blocks have been traversed and values mapped, connect the PHI + // nodes to the results of preceding blocks. LLVM::detail::connectPHINodes(region, moduleTranslation); + + // Remove the blocks and values defined in this region from the mapping since + // they are not visible outside of this region. This allows the same region to + // be converted several times, that is cloned, without clashes, and slightly + // speeds up the lookups. + moduleTranslation.forgetMapping(region); } /// Converts the OpenMP parallel operation to LLVM IR. @@ -204,6 +274,167 @@ return success(); } +/// Returns a reduction declaration that corresponds to the given reduction +/// operation in the given container. Currently only supports reductions inside +/// WsLoopOp but can be easily extended. +static omp::ReductionDeclareOp findReductionDecl(omp::WsLoopOp container, + omp::ReductionOp reduction) { + SymbolRefAttr reductionSymbol; + for (unsigned i = 0, e = container.getNumReductionVars(); i < e; ++i) { + if (container.reduction_vars()[i] != reduction.accumulator()) + continue; + reductionSymbol = (*container.reductions())[i].cast(); + break; + } + assert(reductionSymbol && + "reduction operation must be associated with a declaration"); + + return SymbolTable::lookupNearestSymbolFrom( + container, reductionSymbol); +} + +/// Populates `reductions` with reduction declarations used in the given loop. 
+static void +collectReductionDecls(omp::WsLoopOp loop, + SmallVectorImpl &reductions) { + Optional attr = loop.reductions(); + if (!attr) + return; + + reductions.reserve(reductions.size() + loop.getNumReductionVars()); + for (auto symbolRef : attr->getAsRange()) { + reductions.push_back( + SymbolTable::lookupNearestSymbolFrom( + loop, symbolRef)); + } +} + +/// Translates the blocks contained in the given region and appends them at +/// the current insertion point of `builder`. The operations of the entry block +/// are appended to the current insertion block, which is not expected to have a +/// terminator. If set, `continuationBlockArgs` is populated with translated +/// values that correspond to the values omp.yield'ed from the region. +static LogicalResult inlineConvertOmpRegions( + Region &region, StringRef blockName, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + SmallVectorImpl *continuationBlockArgs = nullptr) { + if (region.empty()) + return success(); + + // Special case for single-block regions that don't create additional + // blocks: insert operations without creating additional blocks. + if (llvm::hasSingleElement(region)) { + moduleTranslation.mapBlock(&region.front(), builder.GetInsertBlock()); + if (failed(moduleTranslation.convertBlock( + region.front(), /*ignoreArguments=*/true, builder))) + return failure(); + + // The continuation arguments are simply the translated terminator operands. + if (continuationBlockArgs) + llvm::append_range( + *continuationBlockArgs, + moduleTranslation.lookupValues(region.front().back().getOperands())); + + // Drop the mapping that is no longer necessary so that the same region can + // be processed multiple times. + moduleTranslation.forgetMapping(region); + return success(); + } + + // Create the continuation block manually instead of calling splitBlock + // because the current insertion block may not have a terminator. 
+ llvm::BasicBlock *continuationBlock = + llvm::BasicBlock::Create(builder.getContext(), blockName + ".cont", + builder.GetInsertBlock()->getParent(), + builder.GetInsertBlock()->getNextNode()); + builder.CreateBr(continuationBlock); + + LogicalResult bodyGenStatus = success(); + SmallVector phis; + convertOmpOpRegions(region, blockName, *builder.GetInsertBlock(), + *continuationBlock, builder, moduleTranslation, + bodyGenStatus, &phis); + if (failed(bodyGenStatus)) + return failure(); + if (continuationBlockArgs) + llvm::append_range(*continuationBlockArgs, phis); + builder.SetInsertPoint(continuationBlock, + continuationBlock->getFirstInsertionPt()); + return success(); +} + +namespace { +/// Owning equivalents of OpenMPIRBuilder::(Atomic)ReductionGen that are used to +/// store lambdas with capture. +using OwningReductionGen = std::function; +using OwningAtomicReductionGen = + std::function; +} // namespace + +/// Create an OpenMPIRBuilder-compatible reduction generator for the given +/// reduction declaration. The generator uses `builder` but ignores its +/// insertion point. +static OwningReductionGen +makeReductionGen(omp::ReductionDeclareOp decl, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + // The lambda is mutable because we need access to non-const methods of decl + // (which aren't actually mutating it), and we must capture decl by-value to + // avoid the dangling reference after the parent function returns. 
+ OwningReductionGen gen = + [&, decl](llvm::OpenMPIRBuilder::InsertPointTy insertPoint, + llvm::Value *lhs, llvm::Value *rhs, + llvm::Value *&result) mutable { + Region &reductionRegion = decl.reductionRegion(); + moduleTranslation.mapValue(reductionRegion.front().getArgument(0), lhs); + moduleTranslation.mapValue(reductionRegion.front().getArgument(1), rhs); + builder.restoreIP(insertPoint); + SmallVector phis; + if (failed(inlineConvertOmpRegions(reductionRegion, + "omp.reduction.nonatomic.body", + builder, moduleTranslation, &phis))) + return llvm::OpenMPIRBuilder::InsertPointTy(); + assert(phis.size() == 1); + result = phis[0]; + return builder.saveIP(); + }; + return gen; +} + +/// Create an OpenMPIRBuilder-compatible atomic reduction generator for the +/// given reduction declaration. The generator uses `builder` but ignores its +/// insertion point. Returns null if there is no atomic region available in the +/// reduction declaration. +static OwningAtomicReductionGen +makeAtomicReductionGen(omp::ReductionDeclareOp decl, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + if (decl.atomicReductionRegion().empty()) + return OwningAtomicReductionGen(); + + // The lambda is mutable because we need access to non-const methods of decl + // (which aren't actually mutating it), and we must capture decl by-value to + // avoid the dangling reference after the parent function returns. 
+ OwningAtomicReductionGen atomicGen = + [&, decl](llvm::OpenMPIRBuilder::InsertPointTy insertPoint, + llvm::Value *lhs, llvm::Value *rhs) mutable { + Region &atomicRegion = decl.atomicReductionRegion(); + moduleTranslation.mapValue(atomicRegion.front().getArgument(0), lhs); + moduleTranslation.mapValue(atomicRegion.front().getArgument(1), rhs); + builder.restoreIP(insertPoint); + SmallVector phis; + if (failed(inlineConvertOmpRegions(atomicRegion, + "omp.reduction.atomic.body", builder, + moduleTranslation, &phis))) + return llvm::OpenMPIRBuilder::InsertPointTy(); + assert(phis.empty()); + return builder.saveIP(); + }; + return atomicGen; +} + /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder. static LogicalResult convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder, @@ -232,6 +463,49 @@ ? moduleTranslation.lookupValue(loop.schedule_chunk_var()) : llvm::ConstantInt::get(ivType, 1); + SmallVector reductionDecls; + collectReductionDecls(loop, reductionDecls); + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = + findAllocaInsertPoint(builder, moduleTranslation); + + // Allocate space for privatized reduction variables. + SmallVector privateReductionVariables; + DenseMap reductionVariableMap; + unsigned numReductions = loop.getNumReductionVars(); + privateReductionVariables.reserve(numReductions); + if (numReductions != 0) { + llvm::IRBuilderBase::InsertPointGuard guard(builder); + builder.restoreIP(allocaIP); + for (unsigned i = 0; i < numReductions; ++i) { + auto reductionType = + loop.reduction_vars()[i].getType().cast(); + llvm::Value *var = builder.CreateAlloca( + moduleTranslation.convertType(reductionType.getElementType())); + privateReductionVariables.push_back(var); + reductionVariableMap.try_emplace(loop.reduction_vars()[i], var); + } + } + + // Store the mapping between reduction variables and their private copies on + // ModuleTranslation stack. 
It can be then recovered when translating + // omp.reduce operations in a separate call. + LLVM::ModuleTranslation::SaveStack mappingGuard( + moduleTranslation, reductionVariableMap); + + // Before the loop, store the initial values of reductions into reduction + // variables. Although this could be done after allocas, we don't want to mess + // up with the alloca insertion point. + for (unsigned i = 0; i < numReductions; ++i) { + SmallVector phis; + if (failed(inlineConvertOmpRegions(reductionDecls[i].initializerRegion(), + "omp.reduction.neutral", builder, + moduleTranslation, &phis))) + return failure(); + assert(phis.size() == 1 && "expected one value to be yielded from the " + "reduction neutral element declaration region"); + builder.CreateStore(phis[0], privateReductionVariables[i]); + } + // Set up the source location value for OpenMP runtime. llvm::DISubprogram *subprogram = builder.GetInsertBlock()->getParent()->getSubprogram(); @@ -271,8 +545,7 @@ if (failed(bodyGenStatus)) return failure(); - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + allocaIP = findAllocaInsertPoint(builder, moduleTranslation); llvm::OpenMPIRBuilder::InsertPointTy afterIP; llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); if (schedule == omp::ClauseScheduleKind::Static) { @@ -304,7 +577,100 @@ } // Continue building IR after the loop. - builder.restoreIP(afterIP); + builder.restoreIP(loopInfo->getAfterIP()); + + // Process the reductions if required. + if (numReductions == 0) + return success(); + + // Create the reduction generators. We need to own them here because + // ReductionInfo only accepts references to the generators. 
+ SmallVector owningReductionGens; + SmallVector owningAtomicReductionGens; + for (unsigned i = 0; i < numReductions; ++i) { + owningReductionGens.push_back( + makeReductionGen(reductionDecls[i], builder, moduleTranslation)); + owningAtomicReductionGens.push_back( + makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation)); + } + + // Collect the reduction information. + SmallVector reductionInfos; + reductionInfos.reserve(numReductions); + for (unsigned i = 0; i < numReductions; ++i) { + llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr; + if (owningAtomicReductionGens[i]) + atomicGen = owningAtomicReductionGens[i]; + reductionInfos.push_back( + {moduleTranslation.lookupValue(loop.reduction_vars()[i]), + privateReductionVariables[i], owningReductionGens[i], atomicGen}); + } + + // The call to createReductions below expects the block to have a + // terminator. Create an unreachable instruction to serve as terminator + // and remove it later. + llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable(); + builder.SetInsertPoint(tempTerminator); + llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint = + moduleTranslation.getOpenMPBuilder()->createReductions( + builder.saveIP(), allocaIP, reductionInfos, + /*IsNoWait=*/false); + if (!contInsertPoint.getBlock()) + return loop->emitOpError() << "failed to convert reductions"; + auto nextInsertionPoint = moduleTranslation.getOpenMPBuilder()->createBarrier( + contInsertPoint, llvm::omp::OMPD_for); + tempTerminator->eraseFromParent(); + builder.restoreIP(nextInsertionPoint); + + return success(); +} + +/// Converts an OpenMP reduction operation using OpenMPIRBuilder. Expects the +/// mapping between reduction variables and their private equivalents to have +/// been stored on the ModuleTranslation stack. Currently only supports +/// reduction within WsLoopOp, but can be easily extended. 
+static LogicalResult +convertOmpReductionOp(omp::ReductionOp reductionOp, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + // Find the declaration that corresponds to the reduction op. + auto reductionContainer = reductionOp->getParentOfType(); + omp::ReductionDeclareOp declaration = + findReductionDecl(reductionContainer, reductionOp); + assert(declaration && "could not find reduction declaration"); + + // Retrieve the mapping between reduction variables and their private + // equivalents. + const DenseMap *reductionVariableMap = nullptr; + moduleTranslation.stackWalk( + [&](const OpenMPVarMappingStackFrame &frame) { + reductionVariableMap = &frame.mapping; + return WalkResult::interrupt(); + }); + assert(reductionVariableMap && "couldn't find private reduction variables"); + + // Translate the reduction operation by emitting the body of the corresponding + // reduction declaration. + Region &reductionRegion = declaration.reductionRegion(); + llvm::Value *privateReductionVar = + reductionVariableMap->lookup(reductionOp.accumulator()); + llvm::Value *reductionVal = builder.CreateLoad( + moduleTranslation.convertType(reductionOp.operand().getType()), + privateReductionVar); + + moduleTranslation.mapValue(reductionRegion.front().getArgument(0), + reductionVal); + moduleTranslation.mapValue( + reductionRegion.front().getArgument(1), + moduleTranslation.lookupValue(reductionOp.operand())); + + SmallVector phis; + if (failed(inlineConvertOmpRegions(reductionRegion, "omp.reduction.body", + builder, moduleTranslation, &phis))) + return failure(); + assert(phis.size() == 1 && "expected one value to be yielded from " + "the reduction body declaration region"); + builder.CreateStore(phis[0], privateReductionVar); return success(); } @@ -362,6 +728,9 @@ .Case([&](omp::ParallelOp) { return convertOmpParallel(*op, builder, moduleTranslation); }) + .Case([&](omp::ReductionOp reductionOp) { + return convertOmpReductionOp(reductionOp, builder, 
moduleTranslation); + }) .Case([&](omp::MasterOp) { return convertOmpMaster(*op, builder, moduleTranslation); }) @@ -370,9 +739,7 @@ }) .Case([](auto op) { // `yield` and `terminator` can be just omitted. The block structure was - // created in the function that handles their parent operation. - assert(op->getNumOperands() == 0 && - "unexpected OpenMP terminator with operands"); + // created in the region that handles their parent operation. return success(); }) .Default([&](Operation *inst) { diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -224,6 +224,23 @@ ompBuilder->finalize(); } +void ModuleTranslation::forgetMapping(Region &region) { + for (Block &block : region) { + blockMapping.erase(&block); + for (Value arg : block.getArguments()) + valueMapping.erase(arg); + for (Operation &op : block) { + for (Value value : op.getResults()) + valueMapping.erase(value); + if (op.hasSuccessors()) + branchMapping.erase(&op); + if (isa(op)) + globalsMapping.erase(&op); + accessGroupMetadataMapping.erase(&op); + } + } +} + /// Get the SSA value passed to the current block from the terminator operation /// of its predecessor. static Value getPHISourceValue(Block *current, Block *pred, @@ -689,7 +706,8 @@ /// Check whether the module contains only supported ops directly in its body. static LogicalResult checkSupportedModuleOps(Operation *m) { for (Operation &o : getModuleBody(m).getOperations()) - if (!isa(&o) && + if (!isa(&o) && !o.hasTrait()) return o.emitOpError("unsupported module-level operation"); return success(); @@ -778,10 +796,9 @@ return typeTranslator.translateType(type); } -/// A helper to look up remapped operands in the value remapping table.` -SmallVector -ModuleTranslation::lookupValues(ValueRange values) { - SmallVector remapped; +/// A helper to look up remapped operands in the value remapping table. 
+SmallVector ModuleTranslation::lookupValues(ValueRange values) { + SmallVector remapped; remapped.reserve(values.size()); for (Value v : values) remapped.push_back(lookupValue(v)); diff --git a/mlir/test/Target/LLVMIR/openmp-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-reduction.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-reduction.mlir @@ -0,0 +1,418 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// Only check the overall shape of the code and the presence of relevant +// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level. + +omp.reduction.declare @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr + llvm.atomicrmw fadd %arg2, %2 monotonic : f32 + omp.yield +} + +// CHECK-LABEL: @simple_reduction +llvm.func @simple_reduction(%lb : i64, %ub : i64, %step : i64) { + %c1 = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + omp.parallel { + omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step) + reduction(@add_f32 -> %0 : !llvm.ptr) { + %1 = llvm.mlir.constant(2.0 : f32) : f32 + omp.reduction %1, %0 : !llvm.ptr + omp.yield + } + omp.terminator + } + llvm.return +} + +// Call to the outlined function. +// CHECK: call void {{.*}} @__kmpc_fork_call +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variable and its initialization. +// CHECK: %[[PRIVATE:.+]] = alloca float +// CHECK: store float 0.000000e+00, float* %[[PRIVATE]] + +// Call to the reduction function. +// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Atomic reduction. 
+// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]] +// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL]] + +// Non-atomic reduction: +// CHECK: fadd float +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: +// CHECK: call void @__kmpc_barrier + +// Update of the private variable using the reduction region +// (the body block currently comes after all the other blocks). +// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]] +// CHECK: %[[UPDATED:.+]] = fadd float %[[PARTIAL]], 2.000000e+00 +// CHECK: store float %[[UPDATED]], float* %[[PRIVATE]] + +// Reduction function. +// CHECK: define internal void @[[REDFUNC]] +// CHECK: fadd float + +// ----- + +omp.reduction.declare @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr + llvm.atomicrmw fadd %arg2, %2 monotonic : f32 + omp.yield +} + +// When the same reduction declaration is used several times, its regions +// are translated several times, which shouldn't lead to value/block +// remapping assertions. +// CHECK-LABEL: @reuse_declaration +llvm.func @reuse_declaration(%lb : i64, %ub : i64, %step : i64) { + %c1 = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + omp.parallel { + omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step) + reduction(@add_f32 -> %0 : !llvm.ptr, @add_f32 -> %2 : !llvm.ptr) { + %1 = llvm.mlir.constant(2.0 : f32) : f32 + omp.reduction %1, %0 : !llvm.ptr + omp.reduction %1, %2 : !llvm.ptr + omp.yield + } + omp.terminator + } + llvm.return +} + +// Call to the outlined function. 
+// CHECK: call void {{.*}} @__kmpc_fork_call +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variable and its initialization. +// CHECK: %[[PRIVATE1:.+]] = alloca float +// CHECK: %[[PRIVATE2:.+]] = alloca float +// CHECK: store float 0.000000e+00, float* %[[PRIVATE1]] +// CHECK: store float 0.000000e+00, float* %[[PRIVATE2]] + +// Call to the reduction function. +// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Atomic reduction. +// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]] +// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL1]] +// CHECK: %[[PARTIAL2:.+]] = load float, float* %[[PRIVATE2]] +// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL2]] + +// Non-atomic reduction: +// CHECK: fadd float +// CHECK: fadd float +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: +// CHECK: call void @__kmpc_barrier + +// Update of the private variable using the reduction region +// (the body block currently comes after all the other blocks). +// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]] +// CHECK: %[[UPDATED1:.+]] = fadd float %[[PARTIAL1]], 2.000000e+00 +// CHECK: store float %[[UPDATED1]], float* %[[PRIVATE1]] +// CHECK: %[[PARTIAL2:.+]] = load float, float* %[[PRIVATE2]] +// CHECK: %[[UPDATED2:.+]] = fadd float %[[PARTIAL2]], 2.000000e+00 +// CHECK: store float %[[UPDATED2]], float* %[[PRIVATE2]] + +// Reduction function. 
+// CHECK: define internal void @[[REDFUNC]] +// CHECK: fadd float +// CHECK: fadd float + + +// ----- + +omp.reduction.declare @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr + llvm.atomicrmw fadd %arg2, %2 monotonic : f32 + omp.yield +} + +// It's okay not to reference the reduction variable in the body. +// CHECK-LABEL: @missing_omp_reduction +llvm.func @missing_omp_reduction(%lb : i64, %ub : i64, %step : i64) { + %c1 = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + omp.parallel { + omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step) + reduction(@add_f32 -> %0 : !llvm.ptr, @add_f32 -> %2 : !llvm.ptr) { + %1 = llvm.mlir.constant(2.0 : f32) : f32 + omp.reduction %1, %0 : !llvm.ptr + omp.yield + } + omp.terminator + } + llvm.return +} + +// Call to the outlined function. +// CHECK: call void {{.*}} @__kmpc_fork_call +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variable and its initialization. +// CHECK: %[[PRIVATE1:.+]] = alloca float +// CHECK: %[[PRIVATE2:.+]] = alloca float +// CHECK: store float 0.000000e+00, float* %[[PRIVATE1]] +// CHECK: store float 0.000000e+00, float* %[[PRIVATE2]] + +// Call to the reduction function. +// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Atomic reduction. 
+// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]] +// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL1]] +// CHECK: %[[PARTIAL2:.+]] = load float, float* %[[PRIVATE2]] +// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL2]] + +// Non-atomic reduction: +// CHECK: fadd float +// CHECK: fadd float +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: +// CHECK: call void @__kmpc_barrier + +// Update of the private variable using the reduction region +// (the body block currently comes after all the other blocks). +// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]] +// CHECK: %[[UPDATED1:.+]] = fadd float %[[PARTIAL1]], 2.000000e+00 +// CHECK: store float %[[UPDATED1]], float* %[[PRIVATE1]] +// CHECK-NOT: %{{.*}} = load float, float* %[[PRIVATE2]] +// CHECK-NOT: %{{.*}} = fadd float %[[PARTIAL2]], 2.000000e+00 + +// Reduction function. +// CHECK: define internal void @[[REDFUNC]] +// CHECK: fadd float +// CHECK: fadd float + +// ----- + +omp.reduction.declare @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr + llvm.atomicrmw fadd %arg2, %2 monotonic : f32 + omp.yield +} + +// It's okay to refer to the same reduction variable more than once in the +// body. 
+// CHECK-LABEL: @double_reference +llvm.func @double_reference(%lb : i64, %ub : i64, %step : i64) { + %c1 = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + omp.parallel { + omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step) + reduction(@add_f32 -> %0 : !llvm.ptr) { + %1 = llvm.mlir.constant(2.0 : f32) : f32 + omp.reduction %1, %0 : !llvm.ptr + omp.reduction %1, %0 : !llvm.ptr + omp.yield + } + omp.terminator + } + llvm.return +} + +// Call to the outlined function. +// CHECK: call void {{.*}} @__kmpc_fork_call +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variable and its initialization. +// CHECK: %[[PRIVATE:.+]] = alloca float +// CHECK: store float 0.000000e+00, float* %[[PRIVATE]] + +// Call to the reduction function. +// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Atomic reduction. +// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]] +// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL]] + +// Non-atomic reduction: +// CHECK: fadd float +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: +// CHECK: call void @__kmpc_barrier + +// Update of the private variable using the reduction region +// (the body block currently comes after all the other blocks). +// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]] +// CHECK: %[[UPDATED:.+]] = fadd float %[[PARTIAL]], 2.000000e+00 +// CHECK: store float %[[UPDATED]], float* %[[PRIVATE]] +// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]] +// CHECK: %[[UPDATED:.+]] = fadd float %[[PARTIAL]], 2.000000e+00 +// CHECK: store float %[[UPDATED]], float* %[[PRIVATE]] + +// Reduction function. 
+// CHECK: define internal void @[[REDFUNC]] +// CHECK: fadd float + +// ----- + +omp.reduction.declare @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr + llvm.atomicrmw fadd %arg2, %2 monotonic : f32 + omp.yield +} + +omp.reduction.declare @mul_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(1.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fmul %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +// CHECK-LABEL: @no_atomic +llvm.func @no_atomic(%lb : i64, %ub : i64, %step : i64) { + %c1 = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + omp.parallel { + omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step) + reduction(@add_f32 -> %0 : !llvm.ptr, @mul_f32 -> %2 : !llvm.ptr) { + %1 = llvm.mlir.constant(2.0 : f32) : f32 + omp.reduction %1, %0 : !llvm.ptr + omp.reduction %1, %2 : !llvm.ptr + omp.yield + } + omp.terminator + } + llvm.return +} + +// Call to the outlined function. +// CHECK: call void {{.*}} @__kmpc_fork_call +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variables and their initialization. +// CHECK: %[[PRIVATE1:.+]] = alloca float +// CHECK: %[[PRIVATE2:.+]] = alloca float +// CHECK: store float 0.000000e+00, float* %[[PRIVATE1]] +// CHECK: store float 1.000000e+00, float* %[[PRIVATE2]] + +// Call to the reduction function. +// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Atomic reduction not provided (@mul_f32 has no atomic region).
+// CHECK: unreachable + +// Non-atomic reduction: +// CHECK: fadd float +// CHECK: fmul float +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: +// CHECK: call void @__kmpc_barrier + +// Update of the private variable using the reduction region +// (the body block currently comes after all the other blocks). +// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]] +// CHECK: %[[UPDATED1:.+]] = fadd float %[[PARTIAL1]], 2.000000e+00 +// CHECK: store float %[[UPDATED1]], float* %[[PRIVATE1]] +// CHECK: %[[PARTIAL2:.+]] = load float, float* %[[PRIVATE2]] +// CHECK: %[[UPDATED2:.+]] = fmul float %[[PARTIAL2]], 2.000000e+00 +// CHECK: store float %[[UPDATED2]], float* %[[PRIVATE2]] + +// Reduction function. +// CHECK: define internal void @[[REDFUNC]] +// CHECK: fadd float +// CHECK: fmul float