diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -188,6 +188,10 @@
   let builders = [
     OpBuilder<(ins CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>
   ];
+  let extraClassDeclaration = [{
+    /// Returns the number of reduction variables.
+    unsigned getNumReductionVars() { return getReductionVars().size(); }
+  }];
   let assemblyFormat = [{
     oilist( `reduction` `(`
               custom<ReductionVarList>(
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -253,64 +253,6 @@
   llvm_unreachable("Unknown ClauseProcBindKind kind");
 }
 
-/// Converts the OpenMP parallel operation to LLVM IR.
-static LogicalResult
-convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
-                   LLVM::ModuleTranslation &moduleTranslation) {
-  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
-  // TODO: support error propagation in OpenMPIRBuilder and use it instead of
-  // relying on captured variables.
-  LogicalResult bodyGenStatus = success();
-
-  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
-    // Save the alloca insertion point on ModuleTranslation stack for use in
-    // nested regions.
-    LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
-        moduleTranslation, allocaIP);
-
-    // ParallelOp has only one region associated with it.
-    builder.restoreIP(codeGenIP);
-    convertOmpOpRegions(opInst.getRegion(), "omp.par.region", builder,
-                        moduleTranslation, bodyGenStatus);
-  };
-
-  // TODO: Perform appropriate actions according to the data-sharing
-  // attribute (shared, private, firstprivate, ...) of variables.
-  // Currently defaults to shared.
-  auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
-                    llvm::Value &, llvm::Value &vPtr,
-                    llvm::Value *&replacementValue) -> InsertPointTy {
-    replacementValue = &vPtr;
-
-    return codeGenIP;
-  };
-
-  // TODO: Perform finalization actions for variables. This has to be
-  // called for variables which have destructors/finalizers.
-  auto finiCB = [&](InsertPointTy codeGenIP) {};
-
-  llvm::Value *ifCond = nullptr;
-  if (auto ifExprVar = opInst.getIfExprVar())
-    ifCond = moduleTranslation.lookupValue(ifExprVar);
-  llvm::Value *numThreads = nullptr;
-  if (auto numThreadsVar = opInst.getNumThreadsVar())
-    numThreads = moduleTranslation.lookupValue(numThreadsVar);
-  auto pbKind = llvm::omp::OMP_PROC_BIND_default;
-  if (auto bind = opInst.getProcBindVal())
-    pbKind = getProcBindKind(*bind);
-  // TODO: Is the Parallel construct cancellable?
-  bool isCancellable = false;
-
-  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
-      findAllocaInsertPoint(builder, moduleTranslation);
-  llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
-  builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createParallel(
-      ompLoc, allocaIP, bodyGenCB, privCB, finiCB, ifCond, numThreads, pbKind,
-      isCancellable));
-
-  return bodyGenStatus;
-}
-
 /// Converts an OpenMP 'master' operation into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpMaster(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -383,26 +325,59 @@
 
 /// Returns a reduction declaration that corresponds to the given reduction
 /// operation in the given container. Currently only supports reductions inside
-/// WsLoopOp but can be easily extended.
-static omp::ReductionDeclareOp findReductionDecl(omp::WsLoopOp container,
-                                                 omp::ReductionOp reduction) {
-  SymbolRefAttr reductionSymbol;
+/// WsLoopOp and ParallelOp but can be easily extended as long as the given
+/// construct implements getNumReductionVars.
+template <typename T>
+static std::optional<omp::ReductionDeclareOp>
+findReductionDeclInContainer(T container, omp::ReductionOp reduction) {
   for (unsigned i = 0, e = container.getNumReductionVars(); i < e; ++i) {
     if (container.getReductionVars()[i] != reduction.getAccumulator())
       continue;
-    reductionSymbol = cast<SymbolRefAttr>((*container.getReductions())[i]);
-    break;
+
+    SymbolRefAttr reductionSymbol =
+        cast<SymbolRefAttr>((*container.getReductions())[i]);
+    auto declareOp =
+        SymbolTable::lookupNearestSymbolFrom<omp::ReductionDeclareOp>(
+            container, reductionSymbol);
+    return declareOp;
+  }
+  return std::nullopt;
+}
+
+/// Searches for the declaration of `reduction` in `containerOp` and then in
+/// each directly enclosing ParallelOp/WsLoopOp, innermost first.
+static omp::ReductionDeclareOp findReductionDecl(Operation &containerOp,
+                                                 omp::ReductionOp reduction) {
+  std::optional<omp::ReductionDeclareOp> declareOp = std::nullopt;
+  Operation *container = &containerOp;
+
+  while (!declareOp.has_value() && container) {
+    // Search the current container if it is an op that carries reductions.
+    if (auto par = dyn_cast<omp::ParallelOp>(*container)) {
+      declareOp = findReductionDeclInContainer(par, reduction);
+    } else if (auto loop = dyn_cast<omp::WsLoopOp>(*container)) {
+      declareOp = findReductionDeclInContainer(loop, reduction);
+    }
+
+    // Ascend from the op just searched (not from the starting op).
+    Operation *parent = container->getParentOp();
+    if (isa_and_nonnull<omp::ParallelOp, omp::WsLoopOp>(parent)) {
+      container = parent;
+    } else {
+      break;
+    }
   }
-  assert(reductionSymbol &&
+
+  assert(declareOp.has_value() &&
          "reduction operation must be associated with a declaration");
-  return SymbolTable::lookupNearestSymbolFrom<omp::ReductionDeclareOp>(
-      container, reductionSymbol);
+  return *declareOp;
 }
 
 /// Populates `reductions` with reduction declarations used in the given loop.
+template <typename T>
 static void
-collectReductionDecls(omp::WsLoopOp loop,
+collectReductionDecls(T loop,
                       SmallVectorImpl<omp::ReductionDeclareOp> &reductions) {
   std::optional<ArrayAttr> attr = loop.getReductions();
   if (!attr)
@@ -760,6 +735,62 @@
   return bodyGenStatus;
 }
 
+/// Allocate space for privatized reduction variables.
+template <typename T>
+static void
+allocReductionVars(T loop, llvm::IRBuilderBase &builder,
+                   LLVM::ModuleTranslation &moduleTranslation,
+                   llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+                   SmallVectorImpl<omp::ReductionDeclareOp> &reductionDecls,
+                   SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+                   DenseMap<Value, llvm::Value *> &reductionVariableMap) {
+  unsigned numReductions = loop.getNumReductionVars();
+  privateReductionVariables.reserve(numReductions);
+  if (numReductions != 0) {
+    llvm::IRBuilderBase::InsertPointGuard guard(builder);
+    builder.restoreIP(allocaIP);
+    for (unsigned i = 0; i < numReductions; ++i) {
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+      privateReductionVariables.push_back(var);
+      reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
+    }
+  }
+}
+
+/// Creates the reduction generators and fills `reductionInfos` for `loop`.
+template <typename T>
+static void collectReductionInfo(
+    T loop, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    SmallVectorImpl<omp::ReductionDeclareOp> &reductionDecls,
+    SmallVectorImpl<OwningReductionGen> &owningReductionGens,
+    SmallVectorImpl<OwningAtomicReductionGen> &owningAtomicReductionGens,
+    const SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+    SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos) {
+  unsigned numReductions = loop.getNumReductionVars();
+
+  for (unsigned i = 0; i < numReductions; ++i) {
+    owningReductionGens.push_back(
+        makeReductionGen(reductionDecls[i], builder, moduleTranslation));
+    owningAtomicReductionGens.push_back(
+        makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation));
+  }
+
+  // Collect the reduction information.
+  reductionInfos.reserve(numReductions);
+  for (unsigned i = 0; i < numReductions; ++i) {
+    llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr;
+    if (owningAtomicReductionGens[i])
+      atomicGen = owningAtomicReductionGens[i];
+    llvm::Value *variable =
+        moduleTranslation.lookupValue(loop.getReductionVars()[i]);
+    reductionInfos.push_back(
+        {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
+         privateReductionVariables[i], owningReductionGens[i], atomicGen});
+  }
+}
+
 /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -788,21 +819,10 @@
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
       findAllocaInsertPoint(builder, moduleTranslation);
 
-  // Allocate space for privatized reduction variables.
   SmallVector<llvm::Value *> privateReductionVariables;
   DenseMap<Value, llvm::Value *> reductionVariableMap;
-  unsigned numReductions = loop.getNumReductionVars();
-  privateReductionVariables.reserve(numReductions);
-  if (numReductions != 0) {
-    llvm::IRBuilderBase::InsertPointGuard guard(builder);
-    builder.restoreIP(allocaIP);
-    for (unsigned i = 0; i < numReductions; ++i) {
-      llvm::Value *var = builder.CreateAlloca(
-          moduleTranslation.convertType(reductionDecls[i].getType()));
-      privateReductionVariables.push_back(var);
-      reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
-    }
-  }
+  allocReductionVars(loop, builder, moduleTranslation, allocaIP, reductionDecls,
+                     privateReductionVariables, reductionVariableMap);
 
   // Store the mapping between reduction variables and their private copies on
   // ModuleTranslation stack. It can be then recovered when translating
@@ -813,7 +833,7 @@
   // Before the loop, store the initial values of reductions into reduction
   // variables. Although this could be done after allocas, we don't want to mess
   // up with the alloca insertion point.
-  for (unsigned i = 0; i < numReductions; ++i) {
+  for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) {
     SmallVector<llvm::Value *> phis;
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
                                        "omp.reduction.neutral", builder,
@@ -908,33 +928,17 @@
   builder.restoreIP(afterIP);
 
   // Process the reductions if required.
-  if (numReductions == 0)
+  if (loop.getNumReductionVars() == 0)
     return success();
 
   // Create the reduction generators. We need to own them here because
   // ReductionInfo only accepts references to the generators.
   SmallVector<OwningReductionGen> owningReductionGens;
   SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
-  for (unsigned i = 0; i < numReductions; ++i) {
-    owningReductionGens.push_back(
-        makeReductionGen(reductionDecls[i], builder, moduleTranslation));
-    owningAtomicReductionGens.push_back(
-        makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation));
-  }
-
-  // Collect the reduction information.
   SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
-  reductionInfos.reserve(numReductions);
-  for (unsigned i = 0; i < numReductions; ++i) {
-    llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr;
-    if (owningAtomicReductionGens[i])
-      atomicGen = owningAtomicReductionGens[i];
-    llvm::Value *variable =
-        moduleTranslation.lookupValue(loop.getReductionVars()[i]);
-    reductionInfos.push_back(
-        {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
-         privateReductionVariables[i], owningReductionGens[i], atomicGen});
-  }
+  collectReductionInfo(loop, builder, moduleTranslation, reductionDecls,
+                       owningReductionGens, owningAtomicReductionGens,
+                       privateReductionVariables, reductionInfos);
 
   // The call to createReductions below expects the block to have a
   // terminator. Create an unreachable instruction to serve as terminator
@@ -954,6 +958,128 @@
   return success();
 }
 
+/// Converts the OpenMP parallel operation to LLVM IR.
+static LogicalResult
+convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
+                   LLVM::ModuleTranslation &moduleTranslation) {
+  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  // TODO: support error propagation in OpenMPIRBuilder and use it instead of
+  // relying on captured variables.
+  LogicalResult bodyGenStatus = success();
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+
+  auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
+    // Collect reduction declarations
+    SmallVector<omp::ReductionDeclareOp> reductionDecls;
+    collectReductionDecls(opInst, reductionDecls);
+
+    // Allocate reduction vars
+    SmallVector<llvm::Value *> privateReductionVariables;
+    DenseMap<Value, llvm::Value *> reductionVariableMap;
+    allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
+                       reductionDecls, privateReductionVariables,
+                       reductionVariableMap);
+
+    // Store the mapping between reduction variables and their private copies on
+    // ModuleTranslation stack. It can be then recovered when translating
+    // omp.reduce operations in a separate call.
+    LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+        moduleTranslation, reductionVariableMap);
+
+    // Initialize reduction vars; NOTE(review): phis[0] is read even on failure.
+    builder.restoreIP(allocaIP);
+    for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
+      SmallVector<llvm::Value *> phis;
+      if (failed(inlineConvertOmpRegions(
+              reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral",
+              builder, moduleTranslation, &phis)))
+        bodyGenStatus = failure();
+      assert(phis.size() == 1 &&
+             "expected one value to be yielded from the "
+             "reduction neutral element declaration region");
+      builder.restoreIP(allocaIP);
+      builder.CreateStore(phis[0], privateReductionVariables[i]);
+    }
+
+    // Save the alloca insertion point on ModuleTranslation stack for use in
+    // nested regions.
+    LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+        moduleTranslation, allocaIP);
+
+    // ParallelOp has only one region associated with it.
+    builder.restoreIP(codeGenIP);
+    auto regionBlock =
+        convertOmpOpRegions(opInst.getRegion(), "omp.par.region", builder,
+                            moduleTranslation, bodyGenStatus);
+
+    // Process the reductions if required.
+    if (opInst.getNumReductionVars() > 0) {
+      // Collect reduction info
+      SmallVector<OwningReductionGen> owningReductionGens;
+      SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
+      SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
+      collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls,
+                           owningReductionGens, owningAtomicReductionGens,
+                           privateReductionVariables, reductionInfos);
+
+      // Move to region cont block
+      builder.SetInsertPoint(regionBlock->getTerminator());
+
+      // Emit the reductions before a temporary terminator, erased afterwards.
+      llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable();
+      builder.SetInsertPoint(tempTerminator);
+
+      llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
+          ompBuilder->createReductions(builder.saveIP(), allocaIP,
+                                       reductionInfos, /*IsNoWait=*/false);
+      if (!contInsertPoint.getBlock()) {
+        bodyGenStatus = opInst->emitOpError() << "failed to convert reductions";
+        return;
+      }
+
+      tempTerminator->eraseFromParent();
+      builder.restoreIP(contInsertPoint);
+    }
+  };
+
+  // TODO: Perform appropriate actions according to the data-sharing
+  // attribute (shared, private, firstprivate, ...) of variables.
+  // Currently defaults to shared.
+  auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
+                    llvm::Value &, llvm::Value &vPtr,
+                    llvm::Value *&replacementValue) -> InsertPointTy {
+    replacementValue = &vPtr;
+
+    return codeGenIP;
+  };
+
+  // TODO: Perform finalization actions for variables. This has to be
+  // called for variables which have destructors/finalizers.
+  auto finiCB = [&](InsertPointTy codeGenIP) {};
+
+  llvm::Value *ifCond = nullptr;
+  if (auto ifExprVar = opInst.getIfExprVar())
+    ifCond = moduleTranslation.lookupValue(ifExprVar);
+  llvm::Value *numThreads = nullptr;
+  if (auto numThreadsVar = opInst.getNumThreadsVar())
+    numThreads = moduleTranslation.lookupValue(numThreadsVar);
+  auto pbKind = llvm::omp::OMP_PROC_BIND_default;
+  if (auto bind = opInst.getProcBindVal())
+    pbKind = getProcBindKind(*bind);
+  // TODO: Is the Parallel construct cancellable?
+  bool isCancellable = false;
+
+  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+      findAllocaInsertPoint(builder, moduleTranslation);
+  llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+
+  builder.restoreIP(
+      ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB,
+                                 ifCond, numThreads, pbKind, isCancellable));
+
+  return bodyGenStatus;
+}
+
 /// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpSimdLoop(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -1286,15 +1412,20 @@
 /// Converts an OpenMP reduction operation using OpenMPIRBuilder. Expects the
 /// mapping between reduction variables and their private equivalents to have
 /// been stored on the ModuleTranslation stack. Currently only supports
-/// reduction within WsLoopOp, but can be easily extended.
+/// reduction within WsLoopOp and ParallelOp, but can be easily extended.
 static LogicalResult
 convertOmpReductionOp(omp::ReductionOp reductionOp,
                       llvm::IRBuilderBase &builder,
                       LLVM::ModuleTranslation &moduleTranslation) {
   // Find the declaration that corresponds to the reduction op.
-  auto reductionContainer = reductionOp->getParentOfType<omp::WsLoopOp>();
-  omp::ReductionDeclareOp declaration =
-      findReductionDecl(reductionContainer, reductionOp);
+  omp::ReductionDeclareOp declaration;
+  Operation *reductionParent = reductionOp->getParentOp();
+  if (isa_and_nonnull<omp::ParallelOp>(reductionParent) ||
+      isa_and_nonnull<omp::WsLoopOp>(reductionParent)) {
+    declaration = findReductionDecl(*reductionParent, reductionOp);
+  } else {
+    llvm_unreachable("Unhandled reduction container");
+  }
   assert(declaration && "could not find reduction declaration");
 
   // Retrieve the mapping between reduction variables and their private
@@ -1302,11 +1433,13 @@
   const DenseMap<Value, llvm::Value *> *reductionVariableMap = nullptr;
   moduleTranslation.stackWalk<OpenMPVarMappingStackFrame>(
       [&](const OpenMPVarMappingStackFrame &frame) {
-        reductionVariableMap = &frame.mapping;
-        return WalkResult::interrupt();
+        if (frame.mapping.contains(reductionOp.getAccumulator())) {
+          reductionVariableMap = &frame.mapping;
+          return WalkResult::interrupt();
+        }
+        return WalkResult::advance();
       });
   assert(reductionVariableMap && "couldn't find private reduction variables");
-
   // Translate the reduction operation by emitting the body of the corresponding
   // reduction declaration.
   Region &reductionRegion = declaration.getReductionRegion();
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
--- a/mlir/test/Target/LLVMIR/openmp-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
@@ -416,3 +416,146 @@
 // CHECK: define internal void @[[REDFUNC]]
 // CHECK: fadd float
 // CHECK: fmul float
+
+// -----
+
+omp.reduction.declare @add_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(0.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+  %1 = llvm.fadd %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr):
+  %2 = llvm.load %arg3 : !llvm.ptr -> f32
+  llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
+  omp.yield
+}
+
+// CHECK-LABEL: @simple_reduction_parallel
+llvm.func @simple_reduction_parallel() {
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
+  omp.parallel reduction(@add_f32 -> %0 : !llvm.ptr) {
+    %1 = llvm.mlir.constant(2.0 : f32) : f32
+    omp.reduction %1, %0 : f32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE:.+]] = alloca float
+// CHECK: store float 0.000000e+00, ptr %[[PRIVATE]]
+
+// Update of the private variable
+// CHECK: %[[PARTIAL:.+]] = load float, ptr %[[PRIVATE]]
+// CHECK: %[[UPDATED:.+]] = fadd float %[[PARTIAL]], 2.000000e+00
+// CHECK: store float %[[UPDATED]], ptr %[[PRIVATE]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic reduction.
+// CHECK: %[[PARTIAL:.+]] = load float, ptr %[[PRIVATE]]
+// CHECK: atomicrmw fadd ptr %{{.*}}, float %[[PARTIAL]]
+
+// Non-atomic reduction:
+// CHECK: fadd float
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: fadd float
+
+// -----
+
+omp.reduction.declare @add_i32 : i32
+init {
+^bb0(%arg: i32):
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  omp.yield (%0 : i32)
+}
+combiner {
+^bb1(%arg0: i32, %arg1: i32):
+  %1 = llvm.add %arg0, %arg1 : i32
+  omp.yield (%1 : i32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr):
+  %2 = llvm.load %arg3 : !llvm.ptr -> i32
+  llvm.atomicrmw add %arg2, %2 monotonic : !llvm.ptr, i32
+  omp.yield
+}
+
+// CHECK-LABEL: @parallel_nested_workshare_reduction
+llvm.func @parallel_nested_workshare_reduction(%ub : i64) {
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
+
+  %lb = llvm.mlir.constant(1 : i64) : i64
+  %step = llvm.mlir.constant(1 : i64) : i64
+
+  omp.parallel reduction(@add_i32 -> %0 : !llvm.ptr) {
+    omp.wsloop for (%iv) : i64 = (%lb) to (%ub) step (%step) {
+      %ival = llvm.trunc %iv : i64 to i32
+      omp.reduction %ival, %0 : i32, !llvm.ptr
+      omp.yield
+    }
+    omp.terminator
+  }
+
+  llvm.return
+}
+
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE:[0-9]+]] = alloca i32
+// CHECK: store i32 0, ptr %[[PRIVATE]]
+
+// Loop exit (presumably the barrier that ends the nested workshare loop):
+// CHECK: call void @__kmpc_barrier
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic reduction:
+// CHECK: %[[PARTIAL:.+]] = load i32, ptr %[[PRIVATE]]
+// CHECK: atomicrmw add ptr %{{.*}}, i32 %[[PARTIAL]]
+
+// Non-atomic reduction:
+// CHECK: add i32
+// CHECK: call void @__kmpc_end_reduce
+
+// Update of the private variable using the reduction region
+// (the body block currently comes after all the other blocks).
+// CHECK: %[[PARTIAL:.+]] = load i32, ptr %[[PRIVATE]]
+// CHECK: %[[UPDATED:.+]] = add i32 %[[PARTIAL]], {{.*}}
+// CHECK: store i32 %[[UPDATED]], ptr %[[PRIVATE]]
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: add i32