diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -364,6 +364,31 @@
                                                bool NeedsBarrier,
                                                Value *Chunk = nullptr);
 
+  /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
+  ///
+  /// This takes a \p CLI representing a canonical loop, such as the one
+  /// created by \p createCanonicalLoop and emits additional instructions to
+  /// turn it into a workshare loop. In particular, it calls the OpenMP
+  /// dispatch "init" runtime function in the preheader, and the dispatch "next"
+  /// runtime function in the condition block to decide whether to continue.
+  ///
+  /// \param Loc      The source location description, the insertion location
+  ///                 is not used.
+  /// \param CLI      A descriptor of the canonical loop to workshare.
+  /// \param AllocaIP An insertion point for Alloca instructions usable in the
+  ///                 preheader of the loop.
+  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
+  ///                     the loop.
+  /// \param Chunk    The size of loop chunk considered as a unit when
+  ///                 scheduling. If \p nullptr, defaults to 1.
+  ///
+  /// \returns Updated CanonicalLoopInfo.
+  CanonicalLoopInfo *createDynamicWorkshareLoop(const LocationDescription &Loc,
+                                                CanonicalLoopInfo *CLI,
+                                                InsertPointTy AllocaIP,
+                                                bool NeedsBarrier,
+                                                Value *Chunk = nullptr);
+
   /// Tile a loop nest.
   ///
   /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1164,6 +1164,152 @@
   return CLI;
 }
 
+// Returns an LLVM function to call for initializing loop bounds using OpenMP
+// dynamic scheduling depending on `Ty`. Only i32 and i64 are supported by the
+// runtime. Always interpret integers as unsigned similarly to
+// CanonicalLoopInfo.
+static FunctionCallee
+getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
+  unsigned Bitwidth = Ty->getIntegerBitWidth();
+  if (Bitwidth == 32)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
+  if (Bitwidth == 64)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
+  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+// Returns an LLVM function to call for updating the next loop using OpenMP
+// dynamic scheduling depending on `Ty`. Only i32 and i64 are supported by the
+// runtime. Always interpret integers as unsigned similarly to
+// CanonicalLoopInfo.
+static FunctionCallee
+getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
+  unsigned Bitwidth = Ty->getIntegerBitWidth();
+  if (Bitwidth == 32)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
+  if (Bitwidth == 64)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
+  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+CanonicalLoopInfo *OpenMPIRBuilder::createDynamicWorkshareLoop(
+    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
+    InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
+  // Set up the source location value for OpenMP runtime.
+  if (!updateToLocation(Loc))
+    return nullptr;
+
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+
+  // Declare useful OpenMP runtime functions.
+  Value *IV = CLI->getIndVar();
+  Type *IVTy = IV->getType();
+  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
+  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
+
+  // Allocate the slots whose addresses are passed to the "next" function.
+  Builder.restoreIP(AllocaIP);
+  Type *I32Type = Type::getInt32Ty(M.getContext());
+  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
+  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
+  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
+  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
+
+  // At the end of the preheader, prepare for calling the "init" function by
+  // storing the current loop bounds into the allocated space. A canonical loop
+  // always iterates from 0 to trip-count with step 1. Note that "init" expects
+  // an inclusive upper bound.
+  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
+  Constant *Zero = ConstantInt::get(IVTy, 0);
+  Constant *One = ConstantInt::get(IVTy, 1);
+  Builder.CreateStore(Zero, PLowerBound);
+  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
+  Builder.CreateStore(UpperBound, PUpperBound);
+  Builder.CreateStore(One, PStride);
+
+  if (!Chunk)
+    Chunk = One;
+
+  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
+
+  // TODO: extract scheduling type and map it to OMP constant. This is currently
+  // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
+  // Here we use @kmp_sch_dynamic_chunked | @kmp_sch_modifier_nonmonotonic
+  constexpr int DynamicSchedType = 35 | (1 << 30);
+  Constant *SchedulingType = ConstantInt::get(I32Type, DynamicSchedType);
+
+  // Call the "init" function. After the location, thread id and schedule, it
+  // takes the lower bound, the inclusive upper bound, the stride and the chunk
+  // size, in that order.
+  Builder.CreateCall(DynamicInit,
+                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ Zero,
+                      UpperBound, /* Stride */ One, Chunk});
+
+  // Derive the trip count of the loop from the stored bounds.
+  // FIXME: These loads are emitted in the preheader, before any call to the
+  // "next" function, so they only ever observe the values stored above.
+  // Confirm whether the bounds must instead be re-read after each "next" call.
+  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
+  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
+  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
+  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
+  setCanonicalLoopTripCount(CLI, TripCount);
+
+  // Update all uses of the induction variable except the one in the condition
+  // block that compares it with the actual upper bound, and the increment in
+  // the latch block.
+  // TODO: this can eventually move to CanonicalLoopInfo or to a new
+  // CanonicalLoopInfoUpdater interface.
+  Builder.SetInsertPoint(CLI->getBody(), CLI->getBody()->getFirstInsertionPt());
+  Value *UpdatedIV = Builder.CreateAdd(IV, LowerBound);
+  IV->replaceUsesWithIf(UpdatedIV, [&](Use &U) {
+    auto *Instr = dyn_cast<Instruction>(U.getUser());
+    return !Instr ||
+           (Instr->getParent() != CLI->getCond() &&
+            Instr->getParent() != CLI->getLatch() && Instr != UpdatedIV);
+  });
+
+  // Replace the exit test of the condition block: keep iterating for as long
+  // as the "next" function reports a non-zero value. Its result is a 32-bit
+  // integer whatever the induction variable type is, hence the dedicated zero.
+  Builder.SetInsertPoint(CLI->getCond(), CLI->getCond()->getFirstInsertionPt());
+  Value *Res =
+      Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
+                                       PLowerBound, PUpperBound, PStride});
+  Constant *Zero32 = ConstantInt::get(I32Type, 0);
+  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
+  BranchInst *BI =
+      Builder.CreateCondBr(MoreWork, CLI->getBody(), CLI->getExit());
+
+  // Remove the "old" condition code, i.e. every instruction that follows the
+  // new branch, walking from the end of the block back to that branch.
+  for (BasicBlock::iterator It = CLI->getCond()->end();;) {
+    --It;
+    if (It == BasicBlock::iterator(BI))
+      break;
+    Instruction *Inst = &*It;
+    It = Inst->eraseFromParent();
+  }
+
+  // In the "exit" block
+  Builder.SetInsertPoint(CLI->getExit(),
+                         CLI->getExit()->getTerminator()->getIterator());
+
+  // Add the barrier if requested.
+  if (NeedsBarrier)
+    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
+                  /* CheckCancelFlag */ false);
+
+  // CLI->assertOK();
+  return CLI;
+}
+
 /// Make \p Source branch to \p Target.
 ///
 /// Handles two situations:
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -178,11 +178,17 @@
   if (loop.getNumLoops() != 1)
     return opInst.emitOpError("collapsed loops not yet supported");
 
-  if (loop.schedule_val().hasValue() &&
-      omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue()) !=
-          omp::ClauseScheduleKind::Static)
-    return opInst.emitOpError(
-        "only static (default) loop schedule is currently supported");
+  bool isStatic = true;
+
+  if (loop.schedule_val().hasValue()) {
+    auto schedule =
+        omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue());
+    if (schedule != omp::ClauseScheduleKind::Static &&
+        schedule != omp::ClauseScheduleKind::Dynamic)
+      return opInst.emitOpError("only static (default) and dynamic loop "
+                                "schedule is currently supported");
+    isStatic = (schedule == omp::ClauseScheduleKind::Static);
+  }
 
   // Find the loop configuration.
   llvm::Value *lowerBound = moduleTranslation.lookupValue(loop.lowerBound()[0]);
@@ -227,10 +233,10 @@
   // i.e. it has a positive step, uses signed integer semantics. Reconsider
   // this code when WsLoop clearly supports more cases.
   llvm::BasicBlock *insertBlock = builder.GetInsertBlock();
-  llvm::CanonicalLoopInfo *loopInfo =
-      moduleTranslation.getOpenMPBuilder()->createCanonicalLoop(
-          ompLoc, bodyGen, lowerBound, upperBound, step, /*IsSigned=*/true,
-          /*InclusiveStop=*/loop.inclusive());
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  llvm::CanonicalLoopInfo *loopInfo = ompBuilder->createCanonicalLoop(
+      ompLoc, bodyGen, lowerBound, upperBound, step, /*IsSigned=*/true,
+      /*InclusiveStop=*/loop.inclusive());
   if (failed(bodyGenStatus))
     return failure();
 
@@ -240,8 +246,12 @@
   // Put them at the start of the current block for now.
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP(
       insertBlock, insertBlock->getFirstInsertionPt());
-  loopInfo = moduleTranslation.getOpenMPBuilder()->createStaticWorkshareLoop(
-      ompLoc, loopInfo, allocaIP, !loop.nowait(), chunk);
+  if (isStatic)
+    loopInfo = ompBuilder->createStaticWorkshareLoop(ompLoc, loopInfo, allocaIP,
+                                                     !loop.nowait(), chunk);
+  else
+    loopInfo = ompBuilder->createDynamicWorkshareLoop(
+        ompLoc, loopInfo, allocaIP, !loop.nowait(), chunk);
 
   // Continue building IR after the loop.
   builder.restoreIP(loopInfo->getAfterIP());