diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -260,6 +260,29 @@
                                          Value *Start, Value *Stop, Value *Step,
                                          bool IsSigned, bool InclusiveStop);
 
+  /// Modifies the canonical loop to be a statically-scheduled workshare loop.
+  ///
+  /// This takes a \p LoopInfo representing a canonical loop, such as the one
+  /// created by \p createCanonicalLoop and emits additional instructions to
+  /// turn it into a workshare loop. In particular, it calls an OpenMP
+  /// runtime function in the preheader to obtain the loop bounds to be used in
+  /// the current thread, updates the relevant instructions in the canonical
+  /// loop and calls an OpenMP runtime finalization function after the loop.
+  ///
+  /// \param Loc      The source location description, the insertion location
+  ///                 is not used.
+  /// \param LoopInfo A descriptor of the canonical loop to workshare.
+  /// \param AllocaIP An insertion point for Alloca instructions usable in the
+  ///                 preheader of the loop.
+  /// \param Chunk    The size of loop chunk considered as a unit when
+  ///                 scheduling. If \p nullptr, defaults to 1.
+  ///
+  /// \returns An insertion point after the finalization call.
+  InsertPointTy createStaticWorkshareLoop(const LocationDescription &Loc,
+                                          CanonicalLoopInfo *LoopInfo,
+                                          InsertPointTy AllocaIP,
+                                          Value *Chunk = nullptr);
+
   /// Generator for '#omp flush'
   ///
   /// \param Loc The location where the flush directive was encountered
@@ -636,7 +659,9 @@
 ///  |    Cond---\
 ///  |     |     |
 ///  |    Body   |
-///  |     |     |
+///  |    | |    |
+///  |   <...>   |
+///  |    | |    |
 ///   \--Latch   |
 ///              |
 ///             Exit
@@ -644,7 +669,9 @@
 ///            After
 ///
 /// Code in the header, condition block, latch and exit block must not have any
-/// side-effect.
+/// side-effect. The body block is the single entry point into the loop body,
+/// which may contain arbitrary control flow as long as all control paths
+/// eventually branch to the latch block.
 ///
 /// Defined outside OpenMPIRBuilder because one cannot forward-declare nested
 /// classes.
@@ -701,7 +728,7 @@
   /// statements/cancellations).
   BasicBlock *getAfter() const { return After; }
 
-  /// Returns the llvm::Value containing the number of loop iterations. I must
+  /// Returns the llvm::Value containing the number of loop iterations. It must
   /// be valid in the preheader and always interpreted as an unsigned integer of
   /// any bit-width.
   Value *getTripCount() const {
@@ -710,6 +737,15 @@
     return CmpI->getOperand(1);
   }
 
+  /// Sets the number of loop iterations to the given value. This value must be
+  /// valid in the condition block (i.e., defined in the preheader) and is
+  /// interpreted as an unsigned integer.
+  void setTripCount(Value *TripCount) {
+    Instruction *CmpI = &Cond->front();
+    assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
+    CmpI->setOperand(1, TripCount);
+  }
+
   /// Returns the instruction representing the current logical induction
   /// variable. Always unsigned, always starting at 0 with an increment of one.
   Instruction *getIndVar() const {
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -987,6 +987,102 @@
   return createCanonicalLoop(Builder.saveIP(), BodyGen, TripCount);
 }
 
+// Returns an LLVM function to call for initializing loop bounds using OpenMP
+// static scheduling depending on `Ty`. Only i32 and i64 are supported by the
+// runtime. Always interpret integers as unsigned similarly to
+// CanonicalLoopInfo.
+static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
+                                                  OpenMPIRBuilder &OMPBuilder) {
+  unsigned Bitwidth = Ty->getIntegerBitWidth();
+  if (Bitwidth == 32)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
+  if (Bitwidth == 64)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
+  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createStaticWorkshareLoop(
+    const LocationDescription &Loc, CanonicalLoopInfo *LoopInfo,
+    InsertPointTy AllocaIP, Value *Chunk) {
+  // Set up the source location value for OpenMP runtime.
+  updateToLocation(Loc);
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+
+  // Declare useful OpenMP runtime functions.
+  Value *IV = LoopInfo->getIndVar();
+  Type *IVType = IV->getType();
+  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVType, M, *this);
+  FunctionCallee StaticFini =
+      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
+
+  // Allocate space for computed loop bounds as expected by the "init" function.
+  Builder.restoreIP(AllocaIP);
+  Type *I32Type = Type::getInt32Ty(M.getContext());
+  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
+  Value *PLowerBound = Builder.CreateAlloca(IVType);
+  Value *PUpperBound = Builder.CreateAlloca(IVType);
+  Value *PStride = Builder.CreateAlloca(IVType);
+
+  // At the end of the preheader, prepare for calling the "init" function by
+  // storing the current loop bounds into the allocated space. A canonical loop
+  // always iterates from 0 to trip-count with step 1. Note that "init" expects
+  // and produces an inclusive upper bound.
+  Builder.SetInsertPoint(LoopInfo->getPreheader()->getTerminator());
+  Constant *Zero = ConstantInt::get(IVType, 0);
+  Constant *One = ConstantInt::get(IVType, 1);
+  Builder.CreateStore(Zero, PLowerBound);
+  Value *UpperBound = Builder.CreateSub(LoopInfo->getTripCount(), One);
+  Builder.CreateStore(UpperBound, PUpperBound);
+  Builder.CreateStore(One, PStride);
+
+  if (!Chunk)
+    Chunk = One;
+
+  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
+
+  // TODO: extract scheduling type and map it to OMP constant. This is currently
+  // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
+  constexpr int StaticSchedType = 34;
+  Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);
+
+  // Call the "init" function and update the trip count of the loop with the
+  // value it produced.
+  Builder.CreateCall(StaticInit,
+                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
+                      PUpperBound, PStride, One, Chunk});
+  Value *LowerBound = Builder.CreateLoad(IVType, PLowerBound);
+  Value *InclusiveUpperBound = Builder.CreateLoad(IVType, PUpperBound);
+  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
+  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
+  LoopInfo->setTripCount(TripCount);
+
+  // Update all uses of the induction variable except the one in the condition
+  // block that compares it with the actual upper bound, and the increment in
+  // the latch block.
+  Builder.SetInsertPoint(LoopInfo->getBody(),
+                         LoopInfo->getBody()->getFirstInsertionPt());
+  Value *UpdatedIV = Builder.CreateAdd(IV, LowerBound);
+  SmallVector<Use *, 8> Uses;
+  for (Use &U : IV->uses())
+    Uses.push_back(&U);
+  for (Use *U : Uses) {
+    if (auto *Instr = dyn_cast<Instruction>(U->getUser()))
+      if (Instr->getParent() == LoopInfo->getCond() ||
+          Instr->getParent() == LoopInfo->getLatch() || Instr == UpdatedIV)
+        continue;
+    U->set(UpdatedIV);
+  }
+
+  // After the loop, call the "fini" function.
+  Builder.restoreIP(LoopInfo->getAfterIP());
+  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
+
+  return Builder.saveIP();
+}
+
 void CanonicalLoopInfo::eraseFromParent() {
   assert(IsValid && "can only erase previously valid loop cfg");
   IsValid = false;
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -1071,6 +1071,92 @@
   EXPECT_FALSE(verifyModule(*M, &errs()));
 }
 
+TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  IRBuilder<> Builder(BB);
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  Type *LCTy = Type::getInt32Ty(Ctx);
+  Value *StartVal = ConstantInt::get(LCTy, 10);
+  Value *StopVal = ConstantInt::get(LCTy, 52);
+  Value *StepVal = ConstantInt::get(LCTy, 2);
+  auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};
+
+  CanonicalLoopInfo *LoopInfo = OMPBuilder.createCanonicalLoop(
+      Loc, LoopBodyGen, StartVal, StopVal, StepVal,
+      /*IsSigned=*/false, /*InclusiveStop=*/false);
+
+  Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+  InsertPointTy AllocaIP = Builder.saveIP();
+
+  OMPBuilder.createStaticWorkshareLoop(Loc, LoopInfo, AllocaIP);
+  auto AllocaIter = BB->begin();
+  ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
+  AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  EXPECT_NE(PLastIter, nullptr);
+  EXPECT_NE(PLowerBound, nullptr);
+  EXPECT_NE(PUpperBound, nullptr);
+  EXPECT_NE(PStride, nullptr);
+
+  auto PreheaderIter = LoopInfo->getPreheader()->begin();
+  ASSERT_GE(std::distance(LoopInfo->getPreheader()->begin(),
+                          LoopInfo->getPreheader()->end()),
+            7);
+  StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  ASSERT_NE(LowerBoundStore, nullptr);
+  ASSERT_NE(UpperBoundStore, nullptr);
+  ASSERT_NE(StrideStore, nullptr);
+
+  auto *OrigLowerBound =
+      dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
+  auto *OrigUpperBound =
+      dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
+  auto *OrigStride = dyn_cast<ConstantInt>(StrideStore->getValueOperand());
+  ASSERT_NE(OrigLowerBound, nullptr);
+  ASSERT_NE(OrigUpperBound, nullptr);
+  ASSERT_NE(OrigStride, nullptr);
+  EXPECT_EQ(OrigLowerBound->getValue(), 0);
+  EXPECT_EQ(OrigUpperBound->getValue(), 20);
+  EXPECT_EQ(OrigStride->getValue(), 1);
+
+  // Check that the loop IV is updated to account for the lower bound returned
+  // by the OpenMP runtime call.
+  BinaryOperator *Add = dyn_cast<BinaryOperator>(&LoopInfo->getBody()->front());
+  ASSERT_NE(Add, nullptr);
+  EXPECT_EQ(Add->getOperand(0), LoopInfo->getIndVar());
+  auto *LoadedLowerBound = dyn_cast<LoadInst>(Add->getOperand(1));
+  ASSERT_NE(LoadedLowerBound, nullptr);
+  EXPECT_EQ(LoadedLowerBound->getPointerOperand(), PLowerBound);
+
+  // Check that the trip count is updated to account for the lower and upper
+  // bounds returned by the OpenMP runtime call.
+  auto *AddOne = dyn_cast<Instruction>(LoopInfo->getTripCount());
+  ASSERT_NE(AddOne, nullptr);
+  ASSERT_TRUE(AddOne->isBinaryOp());
+  auto *One = dyn_cast<ConstantInt>(AddOne->getOperand(1));
+  ASSERT_NE(One, nullptr);
+  EXPECT_EQ(One->getValue(), 1);
+  auto *Difference = dyn_cast<Instruction>(AddOne->getOperand(0));
+  ASSERT_NE(Difference, nullptr);
+  ASSERT_TRUE(Difference->isBinaryOp());
+  EXPECT_EQ(Difference->getOperand(1), LoadedLowerBound);
+  auto *LoadedUpperBound = dyn_cast<LoadInst>(Difference->getOperand(0));
+  ASSERT_NE(LoadedUpperBound, nullptr);
+  EXPECT_EQ(LoadedUpperBound->getPointerOperand(), PUpperBound);
+
+  // The original loop iterator should only be used in the condition, in the
+  // increment and in the statement that adds the lower bound to it.
+  Value *IV = LoopInfo->getIndVar();
+  EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
+}
+
 TEST_F(OpenMPIRBuilderTest, MasterDirective) {
   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
   OpenMPIRBuilder OMPBuilder(*M);