diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -274,6 +274,70 @@ InsertPointTy ComputeIP = {}, const Twine &Name = "loop"); + /// Collapse a loop nest into a single loop. + /// + /// Merges loops of a loop nest into a single CanonicalLoopNest representation + /// that has the same number of innermost loop iterations as the origin loop + /// nest. The induction variables of the input loops are derived from the + /// collapsed loop's induction variable. This is intended to be used to + /// implement OpenMP's collapse clause. Before applying a directive, + /// collapseLoops normalizes a loop nest to contain only a single loop and the + /// directive's implementation does not need to handle multiple loops itself. + /// This does not remove the need to handle all loop nest handling by + /// directives, such as the ordered() clause or the simd schedule-clause + /// modifier of the worksharing-loop directive. + /// + /// Example: + /// \code + /// for (int i = 0; i < 7; ++i) // Canonical loop "i" + /// for (int j = 0; j < 9; ++j) // Canonical loop "j" + /// body(i, j); + /// \endcode + /// + /// After collapsing with Loops={i,j}, the loop is changed to + /// \code + /// for (int ij = 0; ij < 63; ++ij) { + /// int i = ij / 9; + /// int j = ij % 9; + /// body(i, j); + /// } + /// \endcode + /// + /// In the current implementation, the following limitations apply: + /// + /// * All input loops have an induction variable of the same type. + /// + /// * The collapsed loop will have the same trip count integer type as the + /// input loops. Therefore it is possible that the collapsed loop cannot + /// represent all iterations of the input loops. For instance, assuming a + /// 32 bit integer type, and two input loops both iterating 2^16 times, the + /// theoretical trip count of the collapsed loop would be 2^32 iteration, + /// which cannot be represented in an 32-bit integer. Behavior is undefined + /// in this case. + /// + /// * The trip counts of every input loop must be available at \p ComputeIP. + /// Non-rectangular loops are not yet supported. + /// + /// * At each nest level, code between a surrounding loop and its nested loop + /// is hoisted into the loop body, and such code will be executed more + /// often than before collapsing (or not at all if any inner loop iteration + /// has a trip count of 0). This is permitted by the OpenMP specification. + /// + /// \param DL Debug location for instructions added for collapsing, + /// such as instructions to compute derive the input loop's + /// induction variables. + /// \param Loops Loops in the loop nest to collapse. Loops are specified + /// from outermost-to-innermost and every control flow of a + /// loop's body must pass through its directly nested loop. + /// \param ComputeIP Where additional instruction that compute the collapsed + /// trip count. If not set, defaults to before the generated + /// loop. + /// + /// \returns The CanonicalLoopInfo object representing the collapsed loop. + CanonicalLoopInfo *collapseLoops(DebugLoc DL, + ArrayRef Loops, + InsertPointTy ComputeIP); + /// Modifies the canonical loop to be a statically-scheduled workshare loop. /// /// This takes a \p LoopInfo representing a canonical loop, such as the one diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1225,6 +1225,132 @@ DeleteDeadBlocks(BBVec); } +CanonicalLoopInfo * +OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef Loops, + InsertPointTy ComputeIP) { + assert(Loops.size() >= 1 && "At least one loop required"); + size_t NumLoops = Loops.size(); + + // Nothing to do if there is already just one loop. + if (NumLoops == 1) + return Loops.front(); + + CanonicalLoopInfo *Outermost = Loops.front(); + CanonicalLoopInfo *Innermost = Loops.back(); + BasicBlock *OrigPreheader = Outermost->getPreheader(); + BasicBlock *OrigAfter = Outermost->getAfter(); + Function *F = OrigPreheader->getParent(); + + // Setup the IRBuilder for inserting the trip count computation. + Builder.SetCurrentDebugLocation(DL); + if (ComputeIP.isSet()) + Builder.restoreIP(ComputeIP); + else + Builder.restoreIP(Outermost->getPreheaderIP()); + + // Derive the collapsed' loop trip count. + // TODO: Find common/largest indvar type. + Value *CollapsedTripCount = nullptr; + for (CanonicalLoopInfo *L : Loops) { + Value *OrigTripCount = L->getTripCount(); + if (!CollapsedTripCount) { + CollapsedTripCount = OrigTripCount; + continue; + } + + // TODO: Enable UndefinedSanitizer to diagnose an overflow here. + CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount, + {}, /*HasNUW=*/true); + } + + // Create the collapsed loop control flow. + CanonicalLoopInfo *Result = + createLoopSkeleton(DL, CollapsedTripCount, F, + OrigPreheader->getNextNode(), OrigAfter, "collapsed"); + + // Build the collapsed loop body code. + // Start with deriving the input loop induction variables from the collapsed + // one, using a divmod scheme. To preserve the original loops' order, the + // innermost loop use the least significant bits. + Builder.restoreIP(Result->getBodyIP()); + + Value *Leftover = Result->getIndVar(); + SmallVector NewIndVars; + NewIndVars.set_size(NumLoops); + for (int i = NumLoops - 1; i >= 1; --i) { + Value *OrigTripCount = Loops[i]->getTripCount(); + + Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount); + NewIndVars[i] = NewIndVar; + + Leftover = Builder.CreateUDiv(Leftover, OrigTripCount); + } + // Outermost loop gets all the remaining bits. + NewIndVars[0] = Leftover; + + // Construct the loop body control flow. + // We progressively construct the branch structure following in direction of + // the control flow, from the leading in-between code, the loop nest body, the + // trailing in-between code, and rejoining the collapsed loop's latch. + // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If + // the ContinueBlock is set, continue with that block. If ContinuePred, use + // its predecessors as sources. + BasicBlock *ContinueBlock = Result->getBody(); + BasicBlock *ContinuePred = nullptr; + auto ContinueWith = [&ContinueBlock, &ContinuePred, + DL](BasicBlock *Dest, BasicBlock *NextSrc, + bool IncludeNextSrcBB) { + if (ContinueBlock) + redirectTo(ContinueBlock, Dest, DL); + else + redirectAllPredecessorsTo(ContinuePred, Dest, DL); + + ContinueBlock = nullptr; + ContinuePred = nullptr; + if (IncludeNextSrcBB) + ContinueBlock = NextSrc; + else + ContinuePred = NextSrc; + }; + + // The code before the nested loop of each level. + // Because we are sinking it into the nest, it will be executed more often + // that the original loop. More sophisticated schemes could keep track of what + // the in-between code is and instantiate it only once per thread. + for (int i = 0; i < NumLoops - 1; ++i) + ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader(), false); + + // Connect the loop nest body. + ContinueWith(Innermost->getBody(), Innermost->getLatch(), false); + + // The code after the nested loop at each level. + for (int i = NumLoops - 2; i >= 0; --i) + ContinueWith(Loops[i + 1]->getAfter(), Loops[i]->getLatch(), false); + + // Connect the finished loop to the collapsed loop latch. + ContinueWith(Result->getLatch(), nullptr, false); + + // Replace the input loops with the new collapsed loop. + redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL); + redirectTo(Result->getAfter(), Outermost->getAfter(), DL); + + // Replace the input loop indvars with the derived ones. + for (int i = 0; i < NumLoops; ++i) + Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]); + + // Remove unused parts of the input loops. + SmallVector OldControlBBs; + OldControlBBs.reserve(6 * Loops.size()); + for (CanonicalLoopInfo *Loop : Loops) + Loop->collectControlBlocks(OldControlBBs); + removeUnusedBlocksFromParent(OldControlBBs); + +#ifndef NDEBUG + Result->assertOK(); +#endif + return Result; +} + std::vector OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef Loops, ArrayRef TileSizes) { diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -1160,6 +1160,99 @@ EXPECT_FALSE(verifyModule(*M, &errs())); } +TEST_F(OpenMPIRBuilderTest, CollapseNestedLoops) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + + IRBuilder<> Builder(BB); + + Type *LCTy = F->getArg(0)->getType(); + Constant *One = ConstantInt::get(LCTy, 1); + Constant *Two = ConstantInt::get(LCTy, 2); + Value *OuterTripCount = + Builder.CreateAdd(F->getArg(0), Two, "tripcount.outer"); + Value *InnerTripCount = + Builder.CreateAdd(F->getArg(0), One, "tripcount.inner"); + + // Fix an insertion point for ComputeIP. + BasicBlock *LoopNextEnter = + BasicBlock::Create(M->getContext(), "loopnest.enter", F, + Builder.GetInsertBlock()->getNextNode()); + BranchInst *EnterBr = Builder.CreateBr(LoopNextEnter); + InsertPointTy ComputeIP{EnterBr->getParent(), EnterBr->getIterator()}; + + Builder.SetInsertPoint(LoopNextEnter); + OpenMPIRBuilder::LocationDescription OuterLoc(Builder.saveIP(), DL); + + CanonicalLoopInfo *InnerLoop = nullptr; + CallInst *InbetweenLead = nullptr; + CallInst *InbetweenTrail = nullptr; + CallInst *Call = nullptr; + auto OuterLoopBodyGenCB = [&](InsertPointTy OuterCodeGenIP, Value *OuterLC) { + Builder.restoreIP(OuterCodeGenIP); + InbetweenLead = + createPrintfCall(Builder, "In-between lead i=%d\\n", {OuterLC}); + + auto InnerLoopBodyGenCB = [&](InsertPointTy InnerCodeGenIP, + Value *InnerLC) { + Builder.restoreIP(InnerCodeGenIP); + Call = createPrintfCall(Builder, "body i=%d j=%d\\n", {OuterLC, InnerLC}); + }; + InnerLoop = OMPBuilder.createCanonicalLoop( + Builder.saveIP(), InnerLoopBodyGenCB, InnerTripCount, "inner"); + + Builder.restoreIP(InnerLoop->getAfterIP()); + InbetweenTrail = + createPrintfCall(Builder, "In-between trail i=%d\\n", {OuterLC}); + }; + CanonicalLoopInfo *OuterLoop = OMPBuilder.createCanonicalLoop( + OuterLoc, OuterLoopBodyGenCB, OuterTripCount, "outer"); + + // Finish the function. + Builder.restoreIP(OuterLoop->getAfterIP()); + Builder.CreateRetVoid(); + + CanonicalLoopInfo *Collapsed = + OMPBuilder.collapseLoops(DL, {OuterLoop, InnerLoop}, ComputeIP); + + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); + + // Verify control flow and BB order. + BasicBlock *RefOrder[] = { + Collapsed->getPreheader(), Collapsed->getHeader(), + Collapsed->getCond(), Collapsed->getBody(), + InbetweenLead->getParent(), Call->getParent(), + InbetweenTrail->getParent(), Collapsed->getLatch(), + Collapsed->getExit(), Collapsed->getAfter(), + }; + EXPECT_TRUE(verifyDFSOrder(F, RefOrder)); + EXPECT_TRUE(verifyListOrder(F, RefOrder)); + + // Verify the total trip count. + auto *TripCount = cast(Collapsed->getTripCount()); + EXPECT_EQ(TripCount->getOperand(0), OuterTripCount); + EXPECT_EQ(TripCount->getOperand(1), InnerTripCount); + + // Verify the changed indvar. + auto *OuterIV = cast(Call->getOperand(1)); + EXPECT_EQ(OuterIV->getOpcode(), Instruction::UDiv); + EXPECT_EQ(OuterIV->getParent(), Collapsed->getBody()); + EXPECT_EQ(OuterIV->getOperand(1), InnerTripCount); + EXPECT_EQ(OuterIV->getOperand(0), Collapsed->getIndVar()); + + auto *InnerIV = cast(Call->getOperand(2)); + EXPECT_EQ(InnerIV->getOpcode(), Instruction::URem); + EXPECT_EQ(InnerIV->getParent(), Collapsed->getBody()); + EXPECT_EQ(InnerIV->getOperand(0), Collapsed->getIndVar()); + EXPECT_EQ(InnerIV->getOperand(1), InnerTripCount); + + EXPECT_EQ(InbetweenLead->getOperand(1), OuterIV); + EXPECT_EQ(InbetweenTrail->getOperand(1), OuterIV); +} + TEST_F(OpenMPIRBuilderTest, TileSingleLoop) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M);