diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -271,6 +271,70 @@ InsertPointTy ComputeIP = {}, const Twine &Name = "loop"); + /// Collapse a loop nest into a single loop. + /// + /// Merges loops of a loop nest into a single CanonicalLoopNest representation + /// that has the same number of innermost loop iterations as the origin loop + /// nest. The induction variables of the input loops are derived from the + /// collapsed loop's induction variable. This is intended to be used to + /// implement OpenMP's collapse clause. Before applying a directive, + /// collapseLoops normalizes a loop nest to contain only a single loop and the + /// directive's implementation does not need to handle multiple loops itself. + /// This does not remove the need to handle all loop nest handling by + /// directives, such as the ordered() clause or the simd schedule-clause + /// modifier of the worksharing-loop directive. + /// + /// Example: + /// \code + /// for (int i = 0; i < 7; ++i) // Canonical loop "i" + /// for (int j = 0; j < 9; ++j) // Canonical loop "j" + /// body(i, j); + /// \endcode + /// + /// After collapsing with Loops={i,j}, the loop is changed to + /// \code + /// for (int ij = 0; ij < 63; ++ij) { + /// int i = ij / 9; + /// int j = ij % 9; + /// body(i, j); + /// } + /// \endcode + /// + /// In the current implementation, the following limitations apply: + /// + /// * All input loops have an induction variable of the same type. + /// + /// * The collapsed loop will have the same trip count integer type as the + /// input loops. Therefore it is possible that the collapsed loop cannot + /// represent all iterations of the input loops. For instance, assuming a + /// 32 bit integer type, and two input loops both iterating 2^16 times, the + /// theoretical trip count of the collapsed loop would be 2^32 iteration, + /// which cannot be represented in an 32-bit integer. Behavior is undefined + /// in this case. + /// + /// * The trip counts of every input loop must be available at \p ComputeIP. + /// Non-rectangular loops are not yet supported. + /// + /// * At each nest level, code between a surrounding loop and its nested loop + /// is hoisted into the loop body, and such code will be executed more + /// often than before collapsing (or not at all if any inner loop iteration + /// has a trip count of 0). This is permitted by the OpenMP specification. + /// + /// \param DL Debug location for instructions added for collapsing, + /// such as instructions to compute derive the input loop's + /// induction variables. + /// \param Loops Loops in the loop nest to collapse. Loops are specified + /// from outermost-to-innermost and every control flow of a + /// loop's body must pass through its directly nested loop. + /// \param ComputeIP Where additional instruction that compute the collapsed + /// trip count. If not set, defaults to before the generated + /// loop. + /// + /// \returns The CanonicalLoopInfo object representing the collapsed loop. + CanonicalLoopInfo *collapseLoops(DebugLoc DL, + ArrayRef Loops, + InsertPointTy ComputeIP); + /// Modifies the canonical loop to be a statically-scheduled workshare loop. /// /// This takes a \p LoopInfo representing a canonical loop, such as the one @@ -726,6 +790,12 @@ BasicBlock *Exit; BasicBlock *After; + /// Add the control blocks of this loop to \p BBs. + /// + /// This does not include any block from the body, including the one returned + /// by getBody(). + void collectControlBlocks(SmallVectorImpl &BBs); + public: /// The preheader ensures that there is only a single edge entering the loop. /// Code that must be execute before any loop iteration can be emitted here, @@ -778,6 +848,11 @@ return IndVarPHI; } + /// Return the insertion point for user code before the loop. + OpenMPIRBuilder::InsertPointTy getPreheaderIP() const { + return {Preheader, std::prev(Preheader->end())}; + }; + /// Return the insertion point for user code in the body. OpenMPIRBuilder::InsertPointTy getBodyIP() const { return {Body, Body->begin()}; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -953,7 +953,8 @@ // Emit the body content. We do it after connecting the loop to the CFG to // avoid that the callback encounters degenerate BBs. - BodyGenCB(CL->getBodyIP(), CL->getIndVar()); + if (BodyGenCB) + BodyGenCB(CL->getBodyIP(), CL->getIndVar()); #ifndef NDEBUG CL->assertOK(); @@ -1146,6 +1147,200 @@ return CLI; } +/// Make \p Source branch to \p Target. +/// +/// Handles two situations: +/// * \p Source already has an unconditional branch. +/// * \p Source is a degenerate block. +static void redirectTo(BasicBlock *Source, BasicBlock *Target) { + if (Instruction *Term = Source->getTerminator()) { + auto *Br = cast(Term); + assert(!Br->isConditional()); + BasicBlock *Succ = Br->getSuccessor(0); + Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true); + Br->setSuccessor(0, Target); + return; + } + + BranchInst::Create(Target, Source); +} + +/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is, +/// after this \p OldTarget will be orphaned. +static void redirectAllPredecessorsTo(BasicBlock *OldTarget, + BasicBlock *NewTarget) { + for (auto Pred : make_early_inc_range(predecessors(OldTarget))) + redirectTo(Pred, NewTarget); +} + +/// Determine which blocks in \p BBs are reachable from outside and remove the +/// ones that are not reachable from the function. +static void removeUnusedBlocksFromParent(ArrayRef BBs) { + SmallPtrSet BBsToErase{BBs.begin(), BBs.end()}; + auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) { + for (Use &U : BB->uses()) { + auto UseInst = dyn_cast(U.getUser()); + if (!UseInst) + continue; + if (BBsToErase.count(UseInst->getParent())) + continue; + return true; + } + return false; + }; + + while (true) { + bool Changed = false; + for (BasicBlock *BB : make_early_inc_range(BBsToErase)) { + if (HasRemainingUses(BB)) { + BBsToErase.erase(BB); + Changed = true; + } + } + if (!Changed) + break; + } + + SmallVector BBVec(BBsToErase.begin(), BBsToErase.end()); + DeleteDeadBlocks(BBVec); +} + +CanonicalLoopInfo * +OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef Loops, + InsertPointTy ComputeIP) { + assert(Loops.size() >= 1 && "At least one loop required"); + size_t NumLoops = Loops.size(); + + // Nothing to do if there is already just one loop. + if (NumLoops == 1) + return Loops.front(); + + CanonicalLoopInfo *Outermost = Loops.front(); + CanonicalLoopInfo *Innermost = Loops.back(); + BasicBlock *OrigPreheader = Outermost->getPreheader(); + BasicBlock *OrigAfter = Outermost->getAfter(); + Function *F = OrigPreheader->getParent(); + + // Setup the IRBuilder for inserting the trip count computation. + Builder.SetCurrentDebugLocation(DL); + if (ComputeIP.isSet()) + Builder.restoreIP(ComputeIP); + else + Builder.restoreIP(Outermost->getPreheaderIP()); + + // Derive the collapsed' loop trip count. + // TODO: Find common/largest indvar type. + Value *CollapsedTripCount = nullptr; + for (CanonicalLoopInfo *L : Loops) { + Value *OrigTripCount = L->getTripCount(); + if (!CollapsedTripCount) { + CollapsedTripCount = OrigTripCount; + continue; + } + + CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount, + {}, /*HasNUW=*/true); + } + + // Create the collapsed loop control flow. + CanonicalLoopInfo *Result = + createLoopSkeleton(DL, CollapsedTripCount, F, + OrigPreheader->getNextNode(), OrigAfter, "collapsed"); + + // Build the collapsed loop body code. + // Start with deriving the input loop induction variables from the collapsed + // one, using a divmod scheme. To preserve the original loops' order, the + // innermost loop use the least significant bits. + Builder.restoreIP(Result->getBodyIP()); + + Value *Leftover = Result->getIndVar(); + SmallVector NewIndVars; + NewIndVars.set_size(NumLoops); + for (int i = NumLoops - 1; i >= 1; --i) { + Value *OrigTripCount = Loops[i]->getTripCount(); + + Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount); + NewIndVars[i] = NewIndVar; + + Leftover = Builder.CreateUDiv(Leftover, OrigTripCount); + } + // Outermost loop gets all the remaining bits. + NewIndVars[0] = Leftover; + + // Construct the loop body control flow. + // We progressively construct the branch structure following in direction of + // the control flow, from the leading in-between code, the loop nest body, the + // trailing in-between code, and rejoining the collapsed loop's latch. + // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If + // the ContinueBlock is set, continue with that block. If ContinuePred, use + // its predecessors as sources. + BasicBlock *ContinueBlock = Result->getBody(); + BasicBlock *ContinuePred = nullptr; + + // The code before the nested loop of each level. + // Because we are sinking it into the nest, it will be executed more often + // that the original loop. More sophisticated schemes could keep track of what + // the in-between code is and instantiate it only once per thread. + for (int i = 0; i < NumLoops - 1; ++i) { + CanonicalLoopInfo *Surrounding = Loops[i]; + CanonicalLoopInfo *Nested = Loops[i + 1]; + + if (ContinueBlock) + redirectTo(ContinueBlock, Surrounding->getBody()); + else + redirectAllPredecessorsTo(ContinuePred, Surrounding->getBody()); + ContinueBlock = nullptr; + ContinuePred = Nested->getHeader(); + } + + // Connect the loop nest body. + if (ContinueBlock) + redirectTo(ContinueBlock, Innermost->getBody()); + else + redirectAllPredecessorsTo(ContinuePred, Innermost->getBody()); + ContinueBlock = nullptr; + ContinuePred = Innermost->getLatch(); + + // The code after the nested loop at each level. + for (int i = NumLoops - 2; i >= 0; --i) { + CanonicalLoopInfo *Surrounding = Loops[i]; + CanonicalLoopInfo *Nested = Loops[i + 1]; + + if (ContinueBlock) + redirectTo(ContinueBlock, Nested->getAfter()); + else + redirectAllPredecessorsTo(ContinuePred, Nested->getAfter()); + ContinuePred = Surrounding->getLatch(); + ContinueBlock = nullptr; + } + + // Connect the finished loop to the collapsed loop latch. + if (ContinueBlock) + redirectTo(ContinueBlock, Result->getLatch()); + else + redirectAllPredecessorsTo(ContinuePred, Result->getLatch()); + + // Replace the input loops with the new collapsed loop. + redirectTo(Outermost->getPreheader(), Result->getPreheader()); + redirectTo(Result->getAfter(), Outermost->getAfter()); + + // Replace the input loop indvars with the derived ones. + for (int i = 0; i < NumLoops; ++i) + Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]); + + // Remove unused parts of the input loops. + SmallVector OldControlBBs; + OldControlBBs.reserve(6 * Loops.size()); + for (CanonicalLoopInfo *Loop : Loops) + Loop->collectControlBlocks(OldControlBBs); + removeUnusedBlocksFromParent(OldControlBBs); + +#ifndef NDEBUG + Result->assertOK(); +#endif + return Result; +} + OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, @@ -1552,6 +1747,16 @@ } } +void CanonicalLoopInfo::collectControlBlocks( + SmallVectorImpl &BBs) { + // We only count those BBs as control block for which we do not need to + // reverse the CFG, i.e. not the loop body which can contain arbitrary control + // flow. For consistency, this also means we do not add the Body block, which + // is just the entry to the body code. + BBs.reserve(BBs.size() + 6); + BBs.append({Preheader, Header, Cond, Latch, Exit, After}); +} + void CanonicalLoopInfo::assertOK() const { #ifndef NDEBUG if (!IsValid) diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -325,7 +325,7 @@ "Pred is not a predecessor!"); // Return early if there are no PHI nodes to update. - if (!isa(begin())) + if (empty() || !isa(begin())) return; unsigned NumPreds = cast(front()).getNumIncomingValues(); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -23,6 +23,95 @@ namespace { +/// Create an instruction that uses the values in \p Values. We use "printf" +/// just because it is often used for this purpose in test code, but it is never +/// executed here. +static CallInst *createPrintfCall(IRBuilder<> &Builder, StringRef FormatStr, + ArrayRef Values) { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + + GlobalVariable *GV = Builder.CreateGlobalString(FormatStr, "", 0, M); + Constant *Zero = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant *Indices[] = {Zero, Zero}; + Constant *FormatStrConst = + ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(), GV, Indices); + + Function *PrintfDecl = M->getFunction("printf"); + if (!PrintfDecl) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), true); + PrintfDecl = Function::Create(Ty, Linkage, "printf", M); + } + + SmallVector Args; + Args.push_back(FormatStrConst); + Args.append(Values.begin(), Values.end()); + return Builder.CreateCall(PrintfDecl, Args); +} + +/// Verify that blocks in \p RefOrder are corresponds to the depth-first visit +/// order the control flow of \p F. +/// +/// This is an easy way to verify the branching structure of the CFG without +/// checking every branch instruction individually. For the CFG of a +/// CanonicalLoopInfo, the Cond BB's terminating branch's first edge is entering +/// the body, i.e. the DFS order corresponds to the execution order with one +/// loop iteration. +static testing::AssertionResult +verifyDFSOrder(Function *F, ArrayRef RefOrder) { + auto It = RefOrder.begin(); + auto E = RefOrder.end(); + + df_iterator_default_set Visited; + auto DFS = llvm::depth_first_ext(&F->getEntryBlock(), Visited); + + BasicBlock *Prev = nullptr; + for (BasicBlock *BB : DFS) { + if (It != E && BB == *It) { + Prev = *It; + ++It; + } + } + + if (It == E) + return testing::AssertionSuccess(); + if (!Prev) + return testing::AssertionFailure() + << "Did not find " << (*It)->getName() << " in control flow"; + return testing::AssertionFailure() + << "Expected " << Prev->getName() << " before " << (*It)->getName() + << " in control flow"; +} + +/// Verify that blocks in \p RefOrder are in the same relative order in the +/// linked lists of blocks in \p F. The linked list may contain additional +/// blocks in-between. +/// +/// While the order in the linked list is not relevant for semantics, keeping +/// the order roughly in execution order makes its printout easier to read. +static testing::AssertionResult +verifyListOrder(Function *F, ArrayRef RefOrder) { + auto It = RefOrder.begin(); + auto E = RefOrder.end(); + + BasicBlock *Prev = nullptr; + for (BasicBlock &BB : *F) { + if (It != E && &BB == *It) { + Prev = *It; + ++It; + } + } + + if (It == E) + return testing::AssertionSuccess(); + if (!Prev) + return testing::AssertionFailure() << "Did not find " << (*It)->getName() + << " in function " << F->getName(); + return testing::AssertionFailure() + << "Expected " << Prev->getName() << " before " << (*It)->getName() + << " in function " << F->getName(); +} + class OpenMPIRBuilderTest : public testing::Test { protected: void SetUp() override { @@ -1071,6 +1160,156 @@ EXPECT_FALSE(verifyModule(*M, &errs())); } +TEST_F(OpenMPIRBuilderTest, CollapseNestedLoops) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + + IRBuilder<> Builder(BB); + + Type *LCTy = F->getArg(0)->getType(); + Constant *One = ConstantInt::get(LCTy, 1); + Constant *Two = ConstantInt::get(LCTy, 2); + Value *OuterTripCount = + Builder.CreateAdd(F->getArg(0), Two, "tripcount.outer"); + Value *InnerTripCount = + Builder.CreateAdd(F->getArg(0), One, "tripcount.inner"); + + // Fix an insertion point for ComputeIP. + BasicBlock *LoopNextEnter = + BasicBlock::Create(M->getContext(), "loopnest.enter", F, + Builder.GetInsertBlock()->getNextNode()); + BranchInst *EnterBr = Builder.CreateBr(LoopNextEnter); + InsertPointTy ComputeIP{EnterBr->getParent(), EnterBr->getIterator()}; + + Builder.SetInsertPoint(LoopNextEnter); + OpenMPIRBuilder::LocationDescription OuterLoc(Builder.saveIP(), DL); + + CanonicalLoopInfo *InnerLoop = nullptr; + CallInst *InbetweenLead = nullptr; + CallInst *InbetweenTrail = nullptr; + CallInst *Call = nullptr; + auto OuterLoopBodyGenCB = [&](InsertPointTy OuterCodeGenIP, Value *OuterLC) { + Builder.restoreIP(OuterCodeGenIP); + InbetweenLead = + createPrintfCall(Builder, "In-between lead i=%d\\n", {OuterLC}); + + auto InnerLoopBodyGenCB = [&](InsertPointTy InnerCodeGenIP, + Value *InnerLC) { + Builder.restoreIP(InnerCodeGenIP); + Call = createPrintfCall(Builder, "body i=%d j=%d\\n", {OuterLC, InnerLC}); + }; + InnerLoop = OMPBuilder.createCanonicalLoop( + Builder.saveIP(), InnerLoopBodyGenCB, InnerTripCount, "inner"); + + Builder.restoreIP(InnerLoop->getAfterIP()); + InbetweenTrail = + createPrintfCall(Builder, "In-between trail i=%d\\n", {OuterLC}); + }; + CanonicalLoopInfo *OuterLoop = OMPBuilder.createCanonicalLoop( + OuterLoc, OuterLoopBodyGenCB, OuterTripCount, "outer"); + + // Finish the function. + Builder.restoreIP(OuterLoop->getAfterIP()); + Builder.CreateRetVoid(); + + CanonicalLoopInfo *Collapsed = + OMPBuilder.collapseLoops(DL, {OuterLoop, InnerLoop}, ComputeIP); + + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); + + // Verify control flow and BB order. + BasicBlock *RefOrder[] = { + Collapsed->getPreheader(), Collapsed->getHeader(), + Collapsed->getCond(), Collapsed->getBody(), + InbetweenLead->getParent(), Call->getParent(), + InbetweenTrail->getParent(), Collapsed->getLatch(), + Collapsed->getExit(), Collapsed->getAfter(), + }; + EXPECT_TRUE(verifyDFSOrder(F, RefOrder)); + EXPECT_TRUE(verifyListOrder(F, RefOrder)); + + // Verify the total trip count. + auto *TripCount = cast(Collapsed->getTripCount()); + EXPECT_EQ(TripCount->getOperand(0), OuterTripCount); + EXPECT_EQ(TripCount->getOperand(1), InnerTripCount); + + // Verify the changed indvar. + auto *OuterIV = cast(Call->getOperand(1)); + EXPECT_EQ(OuterIV->getOpcode(), Instruction::UDiv); + EXPECT_EQ(OuterIV->getParent(), Collapsed->getBody()); + EXPECT_EQ(OuterIV->getOperand(1), InnerTripCount); + EXPECT_EQ(OuterIV->getOperand(0), Collapsed->getIndVar()); + + auto *InnerIV = cast(Call->getOperand(2)); + EXPECT_EQ(InnerIV->getOpcode(), Instruction::URem); + EXPECT_EQ(InnerIV->getParent(), Collapsed->getBody()); + EXPECT_EQ(InnerIV->getOperand(0), Collapsed->getIndVar()); + EXPECT_EQ(InnerIV->getOperand(1), InnerTripCount); + + EXPECT_EQ(InbetweenLead->getOperand(1), OuterIV); + EXPECT_EQ(InbetweenTrail->getOperand(1), OuterIV); +} + +TEST_F(OpenMPIRBuilderTest, CollapseTripCounts) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + + IRBuilder<> Builder(BB); + + // Create a loop nest and collapse it. Return the total count which will be a + // constant due to IRBuilder's in-place arithmetic simplification. + auto EvalTripCount = [&](int64_t OuterTripCount, + int64_t InnerTripCount) -> uint64_t { + Type *LCTy = Type::getInt16Ty(Ctx); + Constant *OuterTripCountVal = ConstantInt::get(LCTy, OuterTripCount); + Constant *InnerTripCountVal = ConstantInt::get(LCTy, InnerTripCount); + + OpenMPIRBuilder::LocationDescription OuterLoc(Builder.saveIP(), DL); + CanonicalLoopInfo *OuterLoop = OMPBuilder.createCanonicalLoop( + OuterLoc, nullptr, OuterTripCountVal, "outer"); + + // Create the inner loop in the outer loop's body. + OpenMPIRBuilder::LocationDescription InnerLoc(OuterLoop->getBodyIP(), DL); + CanonicalLoopInfo *InnerLoop = OMPBuilder.createCanonicalLoop( + InnerLoc, nullptr, InnerTripCountVal, "outer"); + Builder.restoreIP(OuterLoop->getAfterIP()); + + CanonicalLoopInfo *Collapsed = + OMPBuilder.collapseLoops(DL, {OuterLoop, InnerLoop}, {}); + + // Extract the trip count. + Value *CollapsedTripCount = Collapsed->getTripCount(); + return cast(CollapsedTripCount)->getValue().getZExtValue(); + }; + + // Empty iteration domain. + EXPECT_EQ(EvalTripCount(0, 0), 0); + EXPECT_EQ(EvalTripCount(42, 0), 0); + EXPECT_EQ(EvalTripCount(0, 42), 0); + + // Usual cases. + EXPECT_EQ(EvalTripCount(1, 1), 1); + EXPECT_EQ(EvalTripCount(7, 5), 35); + + // Close to 16-bit integer range. + EXPECT_EQ(EvalTripCount(0xFFFE, 1), 0xFFFE); + EXPECT_EQ(EvalTripCount(1, 0xFFFE), 0xFFFE); + EXPECT_EQ(EvalTripCount(0x7FFF, 2), 0xFFFE); + EXPECT_EQ(EvalTripCount(2, 0x7FFF), 0xFFFE); + EXPECT_EQ(EvalTripCount(0xFF, 0x100), 0xFF00); + EXPECT_EQ(EvalTripCount(0x100, 0xFF), 0xFF00); + + // Finish and finalize the function. + Builder.CreateRetVoid(); + OMPBuilder.finalize(); + + EXPECT_FALSE(verifyModule(*M, &errs())); +} + TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M);