diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -260,6 +260,32 @@
                                          Value *Start, Value *Stop, Value *Step,
                                          bool IsSigned, bool InclusiveStop);
 
+  /// Modifies the canonical loop to be a statically-scheduled workshare loop.
+  ///
+  /// This takes a \p CLI representing a canonical loop, such as the one
+  /// created by \p createCanonicalLoop and emits additional instructions to
+  /// turn it into a workshare loop. In particular, it calls an OpenMP
+  /// runtime function in the preheader to obtain the loop bounds to be used in
+  /// the current thread, updates the relevant instructions in the canonical
+  /// loop and calls an OpenMP runtime finalization function after the loop.
+  ///
+  /// \param Loc The source location description, the insertion location
+  ///            is not used.
+  /// \param CLI A descriptor of the canonical loop to workshare.
+  /// \param AllocaIP An insertion point for Alloca instructions usable in the
+  ///                 preheader of the loop.
+  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
+  ///                     the loop.
+  /// \param Chunk The size of loop chunk considered as a unit when
+  ///              scheduling. If \p nullptr, defaults to 1.
+  ///
+  /// \returns Updated CanonicalLoopInfo.
+  CanonicalLoopInfo *createStaticWorkshareLoop(const LocationDescription &Loc,
+                                               CanonicalLoopInfo *CLI,
+                                               InsertPointTy AllocaIP,
+                                               bool NeedsBarrier,
+                                               Value *Chunk = nullptr);
+
   /// Generator for '#omp flush'
   ///
   /// \param Loc The location where the flush directive was encountered
@@ -636,7 +662,9 @@
 ///  |    Cond---\
 ///  |     |     |
 ///  |    Body   |
-///  |     |     |
+///  |    | |    |
+///  |   <...>   |
+///  |    | |    |
 ///   \--Latch   |
 ///              |
 ///             Exit
@@ -644,7 +672,9 @@
 ///            After
 ///
 /// Code in the header, condition block, latch and exit block must not have any
-/// side-effect.
+/// side-effect. The body block is the single entry point into the loop body,
+/// which may contain arbitrary control flow as long as all control paths
+/// eventually branch to the latch block.
 ///
 /// Defined outside OpenMPIRBuilder because one cannot forward-declare nested
 /// classes.
@@ -701,7 +731,7 @@
   /// statements/cancellations).
   BasicBlock *getAfter() const { return After; }
 
-  /// Returns the llvm::Value containing the number of loop iterations. I must
+  /// Returns the llvm::Value containing the number of loop iterations. It must
   /// be valid in the preheader and always interpreted as an unsigned integer of
   /// any bit-width.
   Value *getTripCount() const {
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -999,6 +999,118 @@
   return createCanonicalLoop(Builder.saveIP(), BodyGen, TripCount);
 }
 
+// Returns an LLVM function to call for initializing loop bounds using OpenMP
+// static scheduling depending on `Ty`. Only i32 and i64 are supported by the
+// runtime; other bitwidths hit llvm_unreachable. Integers are always
+// interpreted as unsigned, similarly to CanonicalLoopInfo.
+static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
+                                                  OpenMPIRBuilder &OMPBuilder) {
+  unsigned Bitwidth = Ty->getIntegerBitWidth();
+  if (Bitwidth == 32)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
+  if (Bitwidth == 64)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
+  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+// Sets the number of loop iterations to the given value. This value must be
+// valid in the condition block (i.e., defined in the preheader) and is
+// interpreted as an unsigned integer.
+static void setCanonicalLoopTripCount(CanonicalLoopInfo *CLI,
+                                      Value *TripCount) {
+  Instruction *CmpI = &CLI->getCond()->front();
+  assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
+  CmpI->setOperand(1, TripCount);
+  CLI->assertOK();
+}
+
+CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
+    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
+    InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
+  // Set up the source location value for OpenMP runtime.
+  if (!updateToLocation(Loc))
+    return nullptr;
+
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+
+  // Declare useful OpenMP runtime functions.
+  Value *IV = CLI->getIndVar();
+  Type *IVTy = IV->getType();
+  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
+  FunctionCallee StaticFini =
+      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
+
+  // Allocate space for computed loop bounds as expected by the "init" function.
+  Builder.restoreIP(AllocaIP);
+  Type *I32Type = Type::getInt32Ty(M.getContext());
+  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
+  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
+  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
+  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
+
+  // At the end of the preheader, prepare for calling the "init" function by
+  // storing the current loop bounds into the allocated space. A canonical loop
+  // always iterates from 0 to trip-count with step 1. Note that "init" expects
+  // and produces an inclusive upper bound.
+  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
+  Constant *Zero = ConstantInt::get(IVTy, 0);
+  Constant *One = ConstantInt::get(IVTy, 1);
+  Builder.CreateStore(Zero, PLowerBound);
+  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
+  Builder.CreateStore(UpperBound, PUpperBound);
+  Builder.CreateStore(One, PStride);
+
+  if (!Chunk)
+    Chunk = One;
+
+  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
+
+  // TODO: extract scheduling type and map it to OMP constant. This is currently
+  // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
+  constexpr int StaticSchedType = 34;
+  Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);
+
+  // Call the "init" function and update the trip count of the loop with the
+  // value it produced.
+  Builder.CreateCall(StaticInit,
+                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
+                      PUpperBound, PStride, One, Chunk});
+  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
+  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
+  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
+  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
+  setCanonicalLoopTripCount(CLI, TripCount);
+
+  // Update all uses of the induction variable except the one in the condition
+  // block that compares it with the actual upper bound, and the increment in
+  // the latch block.
+  // TODO: this can eventually move to CanonicalLoopInfo or to a new
+  // CanonicalLoopInfoUpdater interface.
+  Builder.SetInsertPoint(CLI->getBody(), CLI->getBody()->getFirstInsertionPt());
+  Value *UpdatedIV = Builder.CreateAdd(IV, LowerBound);
+  IV->replaceUsesWithIf(UpdatedIV, [&](Use &U) {
+    auto *Instr = dyn_cast<Instruction>(U.getUser());
+    return !Instr ||
+           (Instr->getParent() != CLI->getCond() &&
+            Instr->getParent() != CLI->getLatch() && Instr != UpdatedIV);
+  });
+
+  // In the "exit" block, call the "fini" function.
+  Builder.SetInsertPoint(CLI->getExit()->getTerminator());
+  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
+
+  // Add the barrier after the "fini" call if requested (not at Loc's IP).
+  if (NeedsBarrier)
+    createBarrier({Builder.saveIP(), Loc.DL}, omp::Directive::OMPD_for,
+                  /* ForceSimpleCall */ false, /* CheckCancelFlag */ false);
+
+  CLI->assertOK();
+  return CLI;
+}
+
 void CanonicalLoopInfo::eraseFromParent() {
   assert(IsValid && "can only erase previously valid loop cfg");
   IsValid = false;
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -1071,6 +1071,93 @@
   EXPECT_FALSE(verifyModule(*M, &errs()));
 }
 
+TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  IRBuilder<> Builder(BB);
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  Type *LCTy = Type::getInt32Ty(Ctx);
+  Value *StartVal = ConstantInt::get(LCTy, 10);
+  Value *StopVal = ConstantInt::get(LCTy, 52);
+  Value *StepVal = ConstantInt::get(LCTy, 2);
+  auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};
+
+  CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
+      Loc, LoopBodyGen, StartVal, StopVal, StepVal,
+      /*IsSigned=*/false, /*InclusiveStop=*/false);
+
+  Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+  InsertPointTy AllocaIP = Builder.saveIP();
+
+  CLI = OMPBuilder.createStaticWorkshareLoop(Loc, CLI, AllocaIP,
+                                             /*NeedsBarrier=*/true);
+  auto AllocaIter = BB->begin();
+  ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
+  AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  EXPECT_NE(PLastIter, nullptr);
+  EXPECT_NE(PLowerBound, nullptr);
+  EXPECT_NE(PUpperBound, nullptr);
+  EXPECT_NE(PStride, nullptr);
+
+  auto PreheaderIter = CLI->getPreheader()->begin();
+  ASSERT_GE(
+      std::distance(CLI->getPreheader()->begin(), CLI->getPreheader()->end()),
+      7);
+  StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  ASSERT_NE(LowerBoundStore, nullptr);
+  ASSERT_NE(UpperBoundStore, nullptr);
+  ASSERT_NE(StrideStore, nullptr);
+
+  auto *OrigLowerBound =
+      dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
+  auto *OrigUpperBound =
+      dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
+  auto *OrigStride = dyn_cast<ConstantInt>(StrideStore->getValueOperand());
+  ASSERT_NE(OrigLowerBound, nullptr);
+  ASSERT_NE(OrigUpperBound, nullptr);
+  ASSERT_NE(OrigStride, nullptr);
+  EXPECT_EQ(OrigLowerBound->getValue(), 0);
+  EXPECT_EQ(OrigUpperBound->getValue(), 20);
+  EXPECT_EQ(OrigStride->getValue(), 1);
+
+  // Check that the loop IV is updated to account for the lower bound returned
+  // by the OpenMP runtime call.
+  BinaryOperator *Add = dyn_cast<BinaryOperator>(&CLI->getBody()->front());
+  ASSERT_NE(Add, nullptr);
+  EXPECT_EQ(Add->getOperand(0), CLI->getIndVar());
+  auto *LoadedLowerBound = dyn_cast<LoadInst>(Add->getOperand(1));
+  ASSERT_NE(LoadedLowerBound, nullptr);
+  EXPECT_EQ(LoadedLowerBound->getPointerOperand(), PLowerBound);
+
+  // Check that the trip count is updated to account for the lower and upper
+  // bounds returned by the OpenMP runtime call.
+  auto *AddOne = dyn_cast<Instruction>(CLI->getTripCount());
+  ASSERT_NE(AddOne, nullptr);
+  ASSERT_TRUE(AddOne->isBinaryOp());
+  auto *One = dyn_cast<ConstantInt>(AddOne->getOperand(1));
+  ASSERT_NE(One, nullptr);
+  EXPECT_EQ(One->getValue(), 1);
+  auto *Difference = dyn_cast<Instruction>(AddOne->getOperand(0));
+  ASSERT_NE(Difference, nullptr);
+  ASSERT_TRUE(Difference->isBinaryOp());
+  EXPECT_EQ(Difference->getOperand(1), LoadedLowerBound);
+  auto *LoadedUpperBound = dyn_cast<LoadInst>(Difference->getOperand(0));
+  ASSERT_NE(LoadedUpperBound, nullptr);
+  EXPECT_EQ(LoadedUpperBound->getPointerOperand(), PUpperBound);
+
+  // The original loop iterator should only be used in the condition, in the
+  // increment and in the statement that adds the lower bound to it.
+  Value *IV = CLI->getIndVar();
+  EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
+}
+
 TEST_F(OpenMPIRBuilderTest, MasterDirective) {
   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
   OpenMPIRBuilder OMPBuilder(*M);