diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -390,6 +390,31 @@
                                          InsertPointTy AllocaIP,
                                          bool NeedsBarrier);
 
+  /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
+  ///
+  /// This takes a \p CLI representing a canonical loop, such as the one
+  /// created by \p createCanonicalLoop and emits additional instructions to
+  /// turn it into a dynamic workshare loop. In particular, it calls an OpenMP
+  /// runtime function in the preheader to register the loop with the runtime,
+  /// and another in an outer loop to obtain the bounds of each chunk to run.
+  ///
+  /// \param Loc      The source location description, the insertion location
+  ///                 is not used.
+  /// \param CLI      A descriptor of the canonical loop to workshare.
+  /// \param AllocaIP An insertion point for Alloca instructions usable in the
+  ///                 preheader of the loop.
+  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
+  ///                     the loop.
+  /// \param Chunk    The size of loop chunk considered as a unit when
+  ///                 scheduling. If nullptr, defaults to 1.
+  ///
+  /// \returns Point where to insert code after the loop.
+  InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,
+                                           CanonicalLoopInfo *CLI,
+                                           InsertPointTy AllocaIP,
+                                           bool NeedsBarrier,
+                                           Value *Chunk = nullptr);
+
   /// Tile a loop nest.
   ///
   /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1184,6 +1184,172 @@
   return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
 }
 
+/// Returns an LLVM function to call for initializing loop bounds using OpenMP
+/// dynamic scheduling depending on \p Ty. Only i32 and i64 are supported by
+/// the runtime. Always interpret integers as unsigned similarly to
+/// CanonicalLoopInfo.
+static FunctionCallee
+getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
+  unsigned Bitwidth = Ty->getIntegerBitWidth();
+  if (Bitwidth == 32)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
+  if (Bitwidth == 64)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
+  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+/// Returns an LLVM function to call for fetching the next chunk using OpenMP
+/// dynamic scheduling depending on \p Ty. Only i32 and i64 are supported by
+/// the runtime. Always interpret integers as unsigned similarly to
+/// CanonicalLoopInfo.
+static FunctionCallee
+getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
+  unsigned Bitwidth = Ty->getIntegerBitWidth();
+  if (Bitwidth == 32)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
+  if (Bitwidth == 64)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
+  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop(
+    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
+    InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
+  // Set up the source location value for OpenMP runtime.
+  if (!updateToLocation(Loc))
+    llvm_unreachable("TODO: Needs to be fixed");
+
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+
+  // Declare useful OpenMP runtime functions.
+  Value *IV = CLI->getIndVar();
+  Type *IVTy = IV->getType();
+  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
+  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
+
+  // Allocate space for computed loop bounds as expected by the "next" function.
+  Builder.restoreIP(AllocaIP);
+  Type *I32Type = Type::getInt32Ty(M.getContext());
+  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
+  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
+  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
+  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
+
+  // At the end of the preheader, prepare for calling the "init" function by
+  // storing the current loop bounds into the allocated space. A canonical loop
+  // always iterates from 0 to trip-count with step 1. Note that "init" expects
+  // and produces an inclusive upper bound.
+  auto *PreHeader = CLI->getPreheader();
+  Builder.SetInsertPoint(PreHeader->getTerminator());
+  Constant *Zero = ConstantInt::get(IVTy, 0);
+  Constant *One = ConstantInt::get(IVTy, 1);
+  Builder.CreateStore(Zero, PLowerBound);
+  // The bounds handed to the runtime are 1-based and inclusive, so the trip
+  // count itself is the upper bound; no subtraction of one is needed.
+  Value *UpperBound = CLI->getTripCount();
+  Builder.CreateStore(UpperBound, PUpperBound);
+  Builder.CreateStore(One, PStride);
+  Builder.CreateStore(One, PLowerBound);
+
+  // TODO: Do we need to calculate the Step if Chunk is not set?
+  // Currently hard-coded to One!
+  if (!Chunk)
+    Chunk = One;
+
+  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
+
+  // TODO: extract scheduling type and map it to OMP constant. This is currently
+  // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
+  // Here we use kmp_sch_dynamic_chunked | kmp_sch_modifier_nonmonotonic.
+  constexpr int DynamicSchedType = 35 | (1 << 30);
+  Constant *SchedulingType = ConstantInt::get(I32Type, DynamicSchedType);
+
+  // Call the "init" function to hand the iteration space [1, trip-count] and
+  // the chunk size to the runtime.
+  Builder.CreateCall(DynamicInit,
+                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
+                      UpperBound, /* step */ One, Chunk});
+
+  // An outer loop around the existing one.
+  auto *OuterCond = BasicBlock::Create(PreHeader->getContext(),
+                                       PreHeader->getName() + ".outer.cond",
+                                       PreHeader->getParent());
+  // This needs to be 32-bit always, so can't use the IVTy Zero above.
+  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
+  Value *Res =
+      Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
+                                       PLowerBound, PUpperBound, PStride});
+  Constant *Zero32 = ConstantInt::get(I32Type, 0);
+  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
+  Value *LowerBound =
+      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
+  Builder.CreateCondBr(MoreWork, CLI->getHeader(), CLI->getExit());
+
+  // Change PHI-node in loop header to use outer cond rather than preheader,
+  // and set IV to the LowerBound.
+  auto *Phi = &*CLI->getHeader()->begin();
+  if (auto *PI = dyn_cast<PHINode>(Phi)) {
+    PI->setIncomingBlock(0, OuterCond);
+    PI->setIncomingValue(0, LowerBound);
+  } else
+    llvm_unreachable("Expected this to be a phi-node");
+
+  // Then set the pre-header to jump to the OuterCond
+  auto *Term = PreHeader->getTerminator();
+  if (auto *Br = dyn_cast<BranchInst>(Term)) {
+    Br->setSuccessor(0, OuterCond);
+  }
+
+  // Update all uses of the induction variable except the one in the condition
+  // block that compares it with the actual upper bound, and the increment in
+  // the latch block.
+  // TODO: this can eventually move to CanonicalLoopInfo or to a new
+  // CanonicalLoopInfoUpdater interface.
+  Builder.SetInsertPoint(CLI->getBody(), CLI->getBody()->getFirstInsertionPt());
+  Value *UpdatedIV = Builder.CreateAdd(IV, Zero);
+  IV->replaceUsesWithIf(UpdatedIV, [&](Use &U) {
+    auto *Instr = dyn_cast<Instruction>(U.getUser());
+    return !Instr ||
+           (Instr->getParent() != CLI->getCond() &&
+            Instr->getParent() != CLI->getLatch() && Instr != UpdatedIV);
+  });
+
+  // Now adjust the inner condition:
+  // * Use the UpperBound returned from the DynamicNext call.
+  // * Jump to the outer loop condition when done with one of the inner loops.
+  Builder.SetInsertPoint(CLI->getCond(), CLI->getCond()->getFirstInsertionPt());
+  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
+  auto *Comp = &*Builder.GetInsertPoint();
+  if (auto *CI = dyn_cast<CmpInst>(Comp)) {
+    CI->setOperand(1, UpperBound);
+  }
+  // Redirect the inner exit to branch to outer condition.
+  auto *Branch = &*CLI->getCond()->getTerminator()->getIterator();
+  if (auto *BI = dyn_cast<BranchInst>(Branch)) {
+    assert(BI->getSuccessor(1) == CLI->getExit());
+    BI->setSuccessor(1, OuterCond);
+  } else
+    llvm_unreachable("Expected a branch instruction!");
+
+  // Add the barrier if requested.
+  if (NeedsBarrier) {
+    Builder.SetInsertPoint(CLI->getExit(),
+                           CLI->getExit()->getTerminator()->getIterator());
+    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
+                  /* CheckCancelFlag */ false);
+  }
+
+  // Don't return CLI - it is no longer a canonical loop.
+  // Just give back the insertion point after the loop.
+  return CLI->getAfterIP();
+}
+
 /// Make \p Source branch to \p Target.
 ///
 /// Handles two situations:
@@ -1865,7 +2031,7 @@
   llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
 
   Function *Fn =
-          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
+      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
 
   return Builder.CreateCall(Fn, Args);
 }
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -1706,6 +1706,80 @@
   EXPECT_EQ(NumCallsInExitBlock, 3u);
 }
 
+TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoop) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  IRBuilder<> Builder(BB);
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  Type *LCTy = Type::getInt32Ty(Ctx);
+  Value *StartVal = ConstantInt::get(LCTy, 10);
+  Value *StopVal = ConstantInt::get(LCTy, 52);
+  Value *StepVal = ConstantInt::get(LCTy, 2);
+  auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};
+
+  CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
+      Loc, LoopBodyGen, StartVal, StopVal, StepVal,
+      /*IsSigned=*/false, /*InclusiveStop=*/false);
+
+  Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+  InsertPointTy AllocaIP = Builder.saveIP();
+
+  InsertPointTy EndIP =
+      OMPBuilder.createDynamicWorkshareLoop(Loc, CLI, AllocaIP,
+                                            /*NeedsBarrier=*/true);
+  // The returned value should be the "after" point.
+  ASSERT_EQ(EndIP.getBlock(), CLI->getAfterIP().getBlock());
+  ASSERT_EQ(EndIP.getPoint(), CLI->getAfterIP().getPoint());
+
+  auto AllocaIter = BB->begin();
+  ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
+  AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  EXPECT_NE(PLastIter, nullptr);
+  EXPECT_NE(PLowerBound, nullptr);
+  EXPECT_NE(PUpperBound, nullptr);
+  EXPECT_NE(PStride, nullptr);
+
+  auto PreheaderIter = CLI->getPreheader()->begin();
+  ASSERT_GE(
+      std::distance(CLI->getPreheader()->begin(), CLI->getPreheader()->end()),
+      7);
+  StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  ASSERT_NE(LowerBoundStore, nullptr);
+  ASSERT_NE(UpperBoundStore, nullptr);
+  ASSERT_NE(StrideStore, nullptr);
+
+  auto *OrigLowerBound =
+      dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
+  auto *OrigUpperBound =
+      dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
+  auto *OrigStride = dyn_cast<ConstantInt>(StrideStore->getValueOperand());
+  ASSERT_NE(OrigLowerBound, nullptr);
+  ASSERT_NE(OrigUpperBound, nullptr);
+  ASSERT_NE(OrigStride, nullptr);
+  EXPECT_EQ(OrigLowerBound->getValue(), 0);
+  EXPECT_EQ(OrigUpperBound->getValue(), 21);
+  EXPECT_EQ(OrigStride->getValue(), 1);
+
+  // The original loop iterator should only be used in the condition, in the
+  // increment and in the statement that adds the lower bound to it.
+  Value *IV = CLI->getIndVar();
+  EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
+
+  // The exit block should contain the barrier call, plus the call to obtain
+  // the thread ID.
+  BasicBlock *ExitBlock = CLI->getExit();
+  size_t NumCallsInExitBlock =
+      count_if(*ExitBlock, [](Instruction &I) { return isa<CallInst>(I); });
+  EXPECT_EQ(NumCallsInExitBlock, 2u);
+}
+
 TEST_F(OpenMPIRBuilderTest, MasterDirective) {
   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
   OpenMPIRBuilder OMPBuilder(*M);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -179,11 +179,17 @@
   if (loop.getNumLoops() != 1)
     return opInst.emitOpError("collapsed loops not yet supported");
 
-  if (loop.schedule_val().hasValue() &&
-      omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue()) !=
-          omp::ClauseScheduleKind::Static)
-    return opInst.emitOpError(
-        "only static (default) loop schedule is currently supported");
+  bool isStatic = true;
+
+  if (loop.schedule_val().hasValue()) {
+    auto schedule =
+        omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue());
+    if (schedule != omp::ClauseScheduleKind::Static &&
+        schedule != omp::ClauseScheduleKind::Dynamic)
+      return opInst.emitOpError("only static (default) and dynamic loop "
+                                "schedule is currently supported");
+    isStatic = (schedule == omp::ClauseScheduleKind::Static);
+  }
 
   // Find the loop configuration.
   llvm::Value *lowerBound = moduleTranslation.lookupValue(loop.lowerBound()[0]);
@@ -241,11 +247,19 @@
   // Put them at the start of the current block for now.
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP(
       insertBlock, insertBlock->getFirstInsertionPt());
-  loopInfo = moduleTranslation.getOpenMPBuilder()->createStaticWorkshareLoop(
-      ompLoc, loopInfo, allocaIP, !loop.nowait(), chunk);
+  llvm::OpenMPIRBuilder::InsertPointTy afterIP;
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  if (isStatic) {
+    loopInfo = ompBuilder->createStaticWorkshareLoop(ompLoc, loopInfo, allocaIP,
+                                                     !loop.nowait(), chunk);
+    afterIP = loopInfo->getAfterIP();
+  } else {
+    afterIP = ompBuilder->createDynamicWorkshareLoop(ompLoc, loopInfo, allocaIP,
+                                                     !loop.nowait(), chunk);
+  }
 
   // Continue building IR after the loop.
-  builder.restoreIP(loopInfo->getAfterIP());
+  builder.restoreIP(afterIP);
   return success();
 }
 