diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -201,15 +201,19 @@ /// convert the logical iteration variable to the loop counter variable in the /// loop body. /// - /// \param Loc The insert and source location description. + /// \param Loc The insert and source location description. The insert + /// location can be between two instructions or the end of a + /// degenerate block (e.g. a BB under construction). /// \param BodyGenCB Callback that will generate the loop body code. /// \param TripCount Number of iterations the loop body is executed. + /// \param Name Base name used to derive BB and instruction names. /// /// \returns An object representing the created control flow structure which /// can be used for loop-associated directives. CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, - Value *TripCount); + Value *TripCount, + const Twine &Name = "loop"); /// Generator for the control flow structure of an OpenMP canonical loop. /// @@ -233,7 +237,7 @@ /// for (uint8_t i = 100u; i > 0; i += 127u) /// /// - /// TODO: May need to add addtional parameters to represent: + /// TODO: May need to add additional parameters to represent: /// /// * Allow representing downcounting with unsigned integers. /// @@ -252,13 +256,20 @@ /// and Stop are signed integers. /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop /// counter. + /// \param ComputeIP Insertion point for instructions computing the trip + /// count. Can be used to ensure the trip count is available + /// at the outermost loop of a loop nest. If not set, + /// defaults to the preheader of the generated loop. + /// \param Name Base name used to derive BB and instruction names. /// /// \returns An object representing the created control flow structure which /// can be used for loop-associated directives. CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, - bool IsSigned, bool InclusiveStop); + bool IsSigned, bool InclusiveStop, + InsertPointTy ComputeIP = {}, + const Twine &Name = "loop"); /// Modifies the canonical loop to be a statically-scheduled workshare loop. /// @@ -286,6 +297,48 @@ bool NeedsBarrier, Value *Chunk = nullptr); + /// Tile a loop nest. + /// + /// Tiles the loops of \p CL by the tile sizes in \p Sizes. Loops in \p Loops + /// must be perfectly nested, from outermost to innermost loop (i.e. + /// Loops.front() is the outermost loop). The trip count llvm::Value of every + /// loop and every tile sizes must be usable in the outermost loop's + /// preheader. This implies that the loop nest is rectangular. + /// + /// Example: + /// \code + /// for (int i = 0; i < 15; ++i) // Canonical loop "i" + /// for (int j = 0; j < 14; ++j) // Canonical loop "j" + /// body(i, j); + /// \endcode + /// + /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to + /// \code + /// for (int i1 = 0; i1 < 3; ++i1) + /// for (int j1 = 0; j1 < 2; ++j) + /// for (int i2 = 0; i2 < 5; ++i2) + /// for (int j2 = 0; j2 < 7; ++j2) + /// body(i1*3+i2, j1*3+j2); + /// \endcode + /// + /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are + /// referred to the floor, and the loops i2 and j2 are the tiles. Tiling also + /// handles non-constant trip counts, non-constant tile sizes and trip counts + /// that are not multiples of the tile size. + /// + /// @param DL Debug location for instructions added by tiling, for + /// instance the floor- and tile trip count computation. + /// @param Loops Loop to tile. + /// @param TileSizes For each loop in \p Loops, the tile size for that + /// dimensions. + /// + /// \returns A list of generated loops. Contains twice as many loops as the + /// input loop nest; the first half are the floor loops and the + /// second half are the tile loops. + std::vector + tileLoops(DebugLoc DL, ArrayRef Loops, + ArrayRef TileSizes); + /// Generator for '#omp flush' /// /// \param Loc The location where the flush directive was encountered @@ -644,6 +697,28 @@ /// \param CriticalName Name of the critical region. /// Value *getOMPCriticalRegionLock(StringRef CriticalName); + + /// Create the control flow structure of a canonical OpenMP loop. + /// + /// The emitted loop will be disconnected, i.e. no edge to the loop's + /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's + /// IRBuilder location is not preserved. + /// + /// \param DL DebugLoc used for the instructions in the skeleton. + /// \param TripCount Value to be used for the trip count. + /// \param F Function in which to insert the BasicBlocks. + /// \param PreInsertBefore Where to insert BBs that execute before the body, + /// typically the body itself. + /// \param PostInsertBefore Where to insert BBs that execute after the body. + /// \param Name Base name used to derive BB + /// and instruction names. + /// + /// \returns The CanonicalLoopInfo that represents the emitted loop. + CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount, + Function *F, + BasicBlock *PreInsertBefore, + BasicBlock *PostInsertBefore, + const Twine &Name = {}); }; /// Class to represented the control flow structure of an OpenMP canonical loop. @@ -693,8 +768,11 @@ BasicBlock *Exit; BasicBlock *After; - /// Delete this loop if unused. - void eraseFromParent(); + /// Add the control blocks of this loop to \p BBs. + /// + /// This does not include any block from the body, including the one returned + /// by getBody(). + void colleckControlBlocks(SmallVectorImpl &BBs); public: /// The preheader ensures that there is only a single edge entering the loop. @@ -748,6 +826,19 @@ return IndVarPHI; } + /// Return the type of the induction variable (and the trip count). + Type *getIndVarType() const { return getIndVar()->getType(); } + + /// Return the insertion point for user code before the loop. + OpenMPIRBuilder::InsertPointTy getPreheaderIP() const { + return {Preheader, std::prev(Preheader->end())}; + }; + + /// Return the insertion point for user code in the body. + OpenMPIRBuilder::InsertPointTy getBodyIP() const { + return {Body, Body->begin()}; + }; + /// Return the insertion point for user code after the loop. OpenMPIRBuilder::InsertPointTy getAfterIP() const { return {After, After->begin()}; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -14,6 +14,7 @@ #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" @@ -858,44 +859,43 @@ /*Conditional*/ true, /*hasFinalize*/ true); } -CanonicalLoopInfo * -OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, - LoopBodyGenCallbackTy BodyGenCB, - Value *TripCount) { - BasicBlock *BB = Loc.IP.getBlock(); - BasicBlock *NextBB = BB->getNextNode(); - Function *F = BB->getParent(); +CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton( + DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, + BasicBlock *PostInsertBefore, const Twine &Name) { + Module *M = F->getParent(); + LLVMContext &Ctx = M->getContext(); Type *IndVarTy = TripCount->getType(); + Twine Prefix = Twine("omp_") + Name; // Create the basic block structure. BasicBlock *Preheader = - BasicBlock::Create(M.getContext(), "omp_for.preheader", F, NextBB); + BasicBlock::Create(Ctx, Prefix + ".preheader", F, PreInsertBefore); BasicBlock *Header = - BasicBlock::Create(M.getContext(), "omp_for.header", F, NextBB); + BasicBlock::Create(Ctx, Prefix + ".header", F, PreInsertBefore); BasicBlock *Cond = - BasicBlock::Create(M.getContext(), "omp_for.cond", F, NextBB); + BasicBlock::Create(Ctx, Prefix + ".cond", F, PreInsertBefore); BasicBlock *Body = - BasicBlock::Create(M.getContext(), "omp_for.body", F, NextBB); + BasicBlock::Create(Ctx, Prefix + ".body", F, PreInsertBefore); BasicBlock *Latch = - BasicBlock::Create(M.getContext(), "omp_for.inc", F, NextBB); + BasicBlock::Create(Ctx, Prefix + ".inc", F, PostInsertBefore); BasicBlock *Exit = - BasicBlock::Create(M.getContext(), "omp_for.exit", F, NextBB); + BasicBlock::Create(Ctx, Prefix + ".exit", F, PostInsertBefore); BasicBlock *After = - BasicBlock::Create(M.getContext(), "omp_for.after", F, NextBB); + BasicBlock::Create(Ctx, Prefix + ".after", F, PostInsertBefore); - updateToLocation(Loc); - Builder.CreateBr(Preheader); + // Use specified DebugLoc for new instructions. + Builder.SetCurrentDebugLocation(DL); Builder.SetInsertPoint(Preheader); Builder.CreateBr(Header); Builder.SetInsertPoint(Header); - PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_for.iv"); + PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, Prefix + ".iv"); IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader); Builder.CreateBr(Cond); Builder.SetInsertPoint(Cond); - Value *Cmp = Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_for.cmp"); + Value *Cmp = Builder.CreateICmpULT(IndVarPHI, TripCount, Prefix + ".cmp"); Builder.CreateCondBr(Cmp, Body, Exit); Builder.SetInsertPoint(Body); @@ -903,16 +903,13 @@ Builder.SetInsertPoint(Latch); Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1), - "omp_for.next", /*HasNUW=*/true); + Prefix + ".next", /*HasNUW=*/true); Builder.CreateBr(Header); IndVarPHI->addIncoming(Next, Latch); Builder.SetInsertPoint(Exit); Builder.CreateBr(After); - // After all control flow has been created, insert the body user code. - BodyGenCB(InsertPointTy(Body, Body->begin()), IndVarPHI); - // Remember and return the canonical control flow. LoopInfos.emplace_front(); CanonicalLoopInfo *CL = &LoopInfos.front(); @@ -933,9 +930,43 @@ return CL; } +CanonicalLoopInfo * +OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, + LoopBodyGenCallbackTy BodyGenCB, + Value *TripCount, const Twine &Name) { + BasicBlock *BB = Loc.IP.getBlock(); + BasicBlock *NextBB = BB->getNextNode(); + + CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(), + NextBB, NextBB, Name); + BasicBlock *After = CL->getAfter(); + + // If location is not set, don't connect the loop. + if (updateToLocation(Loc)) { + // Split the loop at the insertion point: Branch to the preheader and move + // every following instruction to after the loop (the After BB). Also, the + // new successor is the loop's after block. + Builder.CreateBr(CL->Preheader); + After->getInstList().splice(After->begin(), BB->getInstList(), + Builder.GetInsertPoint(), BB->end()); + After->replaceSuccessorsPhiUsesWith(BB, After); + } + + // Emit the body content. We do it after connecting the loop to the CFG to + // avoid that the callback encounters degenerate BBs. + BodyGenCB(CL->getBodyIP(), CL->getIndVar()); + +#ifndef NDEBUG + CL->assertOK(); +#endif + return CL; +} + CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop( const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, - Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop) { + Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, + InsertPointTy ComputeIP, const Twine &Name) { + // Consider the following difficulties (assuming 8-bit signed integers): // * Adding \p Step to the loop counter which passes \p Stop may overflow: // DO I = 1, 100, 50 @@ -947,7 +978,9 @@ assert(IndVarTy == Stop->getType() && "Stop type mismatch"); assert(IndVarTy == Step->getType() && "Step type mismatch"); - updateToLocation(Loc); + LocationDescription ComputeLoc = + ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc; + updateToLocation(ComputeLoc); ConstantInt *Zero = ConstantInt::get(IndVarTy, 0); ConstantInt *One = ConstantInt::get(IndVarTy, 1); @@ -988,7 +1021,8 @@ InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Span, Incr); CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo); } - Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping); + Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping, + "omp_" + Name + ".tripcount"); auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) { Builder.restoreIP(CodeGenIP); @@ -996,7 +1030,8 @@ Value *IndVar = Builder.CreateAdd(Span, Start); BodyGenCB(Builder.saveIP(), IndVar); }; - return createCanonicalLoop(Builder.saveIP(), BodyGen, TripCount); + LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP(); + return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name); } // Returns an LLVM function to call for initializing loop bounds using OpenMP @@ -1111,18 +1146,249 @@ return CLI; } -void CanonicalLoopInfo::eraseFromParent() { - assert(IsValid && "can only erase previously valid loop cfg"); - IsValid = false; +/// Make \p Source branch to \p Target. +/// +/// Handles two situations: +/// * \p Source already has an unconditional branch. +/// * \p Source is a degenerate block. +static void redirectTo(BasicBlock *Source, BasicBlock *Target) { + if (Instruction *Term = Source->getTerminator()) { + auto *Br = cast(Term); + assert(!Br->isConditional()); + BasicBlock *Succ = Br->getSuccessor(0); + Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true); + Br->setSuccessor(0, Target); + return; + } - SmallVector BBsToRemove{Header, Cond, Latch, Exit}; - SmallVector InstsToRemove; + BranchInst::Create(Target, Source); +} - // Only remove preheader if not re-purposed somewhere else. - if (Preheader->getNumUses() == 0) - BBsToRemove.push_back(Preheader); +/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is, +/// after this \p OldTarget will be orphaned. +static void redirectAllPredecessorsTo(BasicBlock *OldTarget, + BasicBlock *NewTarget) { + for (auto Pred : make_early_inc_range(predecessors(OldTarget))) + redirectTo(Pred, NewTarget); +} + +/// Determine which blocks in \p BBs are reachable from outside and remove the +/// ones that are not reachable from the function. +static void removeUnusedBlocksFromParent(ArrayRef BBs) { + SmallPtrSet BBsToErase{BBs.begin(), BBs.end()}; + auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) { + for (Use &U : BB->uses()) { + auto UseInst = dyn_cast(U.getUser()); + if (!UseInst) + continue; + if (BBsToErase.count(UseInst->getParent())) + continue; + return true; + } + return false; + }; + + while (true) { + bool Changed = false; + for (BasicBlock *BB : make_early_inc_range(BBsToErase)) { + if (HasRemainingUses(BB)) { + BBsToErase.erase(BB); + Changed = true; + } + } + if (!Changed) + break; + } - DeleteDeadBlocks(BBsToRemove); + SmallVector BBVec(BBsToErase.begin(), BBsToErase.end()); + DeleteDeadBlocks(BBVec); +} + +std::vector +OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef Loops, + ArrayRef TileSizes) { + size_t NumLoops = Loops.size(); + assert(TileSizes.size() == NumLoops && + "Must pass as many tile sizes as there are loops"); + assert(NumLoops >= 1 && "At least one loop to tile required"); + + CanonicalLoopInfo *OutermostLoop = Loops.front(); + CanonicalLoopInfo *InnermostLoop = Loops.back(); + Function *F = OutermostLoop->getBody()->getParent(); + BasicBlock *InnerEnter = InnermostLoop->getBody(); + BasicBlock *InnerLatch = InnermostLoop->getLatch(); + + // Collect original trip counts and induction variable to be accessible by + // index. Also, the structure of the original loops is not preserved during + // the construction of the tiled loops, so do it before we scavenge the BBs of + // any original CanonicalLoopInfo. + SmallVector OrigTripCounts, OrigIndVars; + for (CanonicalLoopInfo *L : Loops) { + OrigTripCounts.push_back(L->getTripCount()); + OrigIndVars.push_back(L->getIndVar()); + } + + // Collect the code between loop headers. These may contain SSA definitions + // that are used in the loop nest body. To be usable with in the innermost + // body, these BasicBlocks will be sunk into the loop nest body. That is, + // these instructions may be executed more often than before the tiling. + // TODO: It would be sufficient to only sink them into body of the + // corresponding tile loop. + SmallVector, 4> InbetweenCode; + for (int i : seq(0, NumLoops - 1)) { + CanonicalLoopInfo *Surrounding = Loops[i]; + CanonicalLoopInfo *Nested = Loops[i + 1]; + + BasicBlock *EnterBB = Surrounding->getBody(); + BasicBlock *ExitBB = Nested->getHeader(); + InbetweenCode.emplace_back(EnterBB, ExitBB); + } + + // Compute the trip counts of the floor loops. + Builder.SetCurrentDebugLocation(DL); + Builder.restoreIP(OutermostLoop->getPreheaderIP()); + SmallVector FloorCount, FloorEpilogues, FloorRems; + for (auto i : seq(0, NumLoops)) { + Value *TileSize = TileSizes[i]; + Value *OrigTripCount = OrigTripCounts[i]; + Type *IVType = OrigTripCount->getType(); + + Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize); + Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize); + + // 0 if tripcount divides the tilesize, 1 otherwise. + // 1 means we need an additional iteration for a partial tile. + // + // Unfortunately we cannot just use the roundup-formula + // (tripcount + tilesize - 1)/tilesize + // because the summation might overflow. We do not want introduce undefined + // behavior when the untiled loop nest did not. + Value *FloorTripOverflow = + Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0)); + + FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType); + FloorTripCount = + Builder.CreateAdd(FloorTripCount, FloorTripOverflow, + "omp_floor" + Twine(i) + ".tripcount", true); + + // Remember some values for later use. + FloorEpilogues.push_back(FloorTripCount); + FloorCount.push_back(FloorTripCount); + FloorRems.push_back(FloorTripRem); + } + + // Generate the new loop nest, from the outermost to the innermost. + std::vector Result; + Result.reserve(NumLoops * 2); + + // The basic block of the surrounding loop that enters the nest generated + // loop. + BasicBlock *Enter = OutermostLoop->getPreheader(); + + // The basic block of the surrounding loop where the inner code should + // continue. + BasicBlock *Continue = OutermostLoop->getAfter(); + + // Where the next loop basic block should be inserted. + BasicBlock *OutroInsertBefore = InnermostLoop->getExit(); + + // Create the floor loops. + for (auto i : seq(0, NumLoops)) { + Value *FloorTripCount = FloorCount[i]; + + Builder.SetInsertPoint(InnerEnter, InnerEnter->begin()); + CanonicalLoopInfo *FloorLoop = + createLoopSkeleton(DL, FloorTripCount, F, InnerEnter, OutroInsertBefore, + "floor" + Twine(i)); + redirectTo(Enter, FloorLoop->getPreheader()); + redirectTo(FloorLoop->getAfter(), Continue); + + Result.push_back(FloorLoop); + Enter = FloorLoop->getBody(); + Continue = FloorLoop->getLatch(); + OutroInsertBefore = FloorLoop->getLatch(); + } + + // Within the innermost floor loop, emit the code that computes the tile + // sizes. + Builder.SetInsertPoint(Enter->getTerminator()); + SmallVector TileCounts; + for (auto i : seq(0, NumLoops)) { + CanonicalLoopInfo *FloorLoop = Result[i]; + Value *TileSize = TileSizes[i]; + + Value *FloorIsEpilogue = + Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorEpilogues[i]); + Value *TileTripCount = + Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize); + + TileCounts.push_back(TileTripCount); + } + + // Create the tile loops. + for (auto i : seq(0, NumLoops)) { + CanonicalLoopInfo *TileLoop = createLoopSkeleton( + DL, TileCounts[i], F, InnerEnter, OutroInsertBefore, "tile" + Twine(i)); + redirectTo(Enter, TileLoop->getPreheader()); + redirectTo(TileLoop->getAfter(), Continue); + + Result.push_back(TileLoop); + Enter = TileLoop->getBody(); + Continue = TileLoop->getLatch(); + OutroInsertBefore = TileLoop->getLatch(); + } + + // Insert the inbetween code into the body. + BasicBlock *BodyEnter = Enter; + BasicBlock *BodyEntered = nullptr; + for (auto P : InbetweenCode) { + BasicBlock *EnterBB = P.first; + BasicBlock *ExitBB = P.second; + + if (BodyEnter) + redirectTo(BodyEnter, EnterBB); + else + redirectAllPredecessorsTo(BodyEntered, EnterBB); + + BodyEnter = nullptr; + BodyEntered = ExitBB; + } + + // Append the original loop nest body into the generated loop nest body. + if (BodyEnter) + redirectTo(BodyEnter, InnerEnter); + else + redirectAllPredecessorsTo(BodyEntered, InnerEnter); + redirectAllPredecessorsTo(InnerLatch, Continue); + + // Replace the original induction variable with an induction variable computed + // from the tile and floor induction variables. + Builder.restoreIP(Result.back()->getBodyIP()); + for (auto i : seq(0, NumLoops)) { + CanonicalLoopInfo *FloorLoop = Result[i]; + CanonicalLoopInfo *TileLoop = Result[NumLoops + i]; + Value *OrigIndVar = OrigIndVars[i]; + Value *Size = TileSizes[i]; + + Value *Scale = + Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true); + Value *Shift = + Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true); + OrigIndVar->replaceAllUsesWith(Shift); + } + + // Remove unused parts of the original loops. + SmallVector OldControlBBs; + OldControlBBs.reserve(6 * Loops.size()); + for (CanonicalLoopInfo *Loop : Loops) + Loop->colleckControlBlocks(OldControlBBs); + removeUnusedBlocksFromParent(OldControlBBs); + +#ifndef NDEBUG + for (CanonicalLoopInfo *GenL : Result) + GenL->assertOK(); +#endif + return Result; } OpenMPIRBuilder::InsertPointTy @@ -1531,6 +1797,16 @@ } } +void CanonicalLoopInfo::colleckControlBlocks( + SmallVectorImpl &BBs) { + // We only count those BBs as control block for which we do not need to + // reverse the CFG, i.e. not the loop body which can contain arbitrary control + // flow. For consistency, this also means we do not add the Body block, which + // is just the entry to the body code. + BBs.reserve(BBs.size() + 6); + BBs.append({Preheader, Header, Cond, Latch, Exit, After}); +} + void CanonicalLoopInfo::assertOK() const { #ifndef NDEBUG if (!IsValid) @@ -1565,11 +1841,16 @@ assert(Body); assert(Body->getSinglePredecessor() == Cond && "Body only reachable from exiting block"); + assert(!isa(Body->front())); assert(Latch); assert(isa(Latch->getTerminator()) && "Latch must terminate with unconditional branch"); assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header"); + // TODO: To support simple redirecting of the end of the body code that has + // multiple; introduce another auxiliary basic block like preheader and after. + assert(Latch->getSinglePredecessor() != nullptr); + assert(!isa(Latch->front())); assert(Exit); assert(isa(Exit->getTerminator()) && @@ -1580,6 +1861,7 @@ assert(After); assert(After->getSinglePredecessor() == Exit && "After block only reachable from exit block"); + assert(After->empty() || !isa(After->front())); Instruction *IndVar = getIndVar(); assert(IndVar && "Canonical induction variable not found?"); @@ -1587,6 +1869,17 @@ "Induction variable must be an integer"); assert(cast(IndVar)->getParent() == Header && "Induction variable must be a PHI in the loop header"); + assert(cast(IndVar)->getIncomingBlock(0) == Preheader); + assert( + cast(cast(IndVar)->getIncomingValue(0))->isZero()); + assert(cast(IndVar)->getIncomingBlock(1) == Latch); + + auto *NextIndVar = cast(IndVar)->getIncomingValue(1); + assert(cast(NextIndVar)->getParent() == Latch); + assert(cast(NextIndVar)->getOpcode() == BinaryOperator::Add); + assert(cast(NextIndVar)->getOperand(0) == IndVar); + assert(cast(cast(NextIndVar)->getOperand(1)) + ->isOne()); Value *TripCount = getTripCount(); assert(TripCount && "Loop trip count not found?"); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -325,7 +325,7 @@ "Pred is not a predecessor!"); // Return early if there are no PHI nodes to update. - if (!isa(begin())) + if (empty() || !isa(begin())) return; unsigned NumPreds = cast(front()).getNumIncomingValues(); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -23,6 +23,95 @@ namespace { +/// Create an instruction that uses the values in \p Values. We use "printf" +/// just because it is often used for this purpose in test code, but it is never +/// executed here. +static CallInst *createPrintfCall(IRBuilder<> &Builder, StringRef FormatStr, + ArrayRef Values) { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + + GlobalVariable *GV = Builder.CreateGlobalString(FormatStr, "", 0, M); + Constant *Zero = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant *Indices[] = {Zero, Zero}; + Constant *FormatStrConst = + ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(), GV, Indices); + + Function *PrintfDecl = M->getFunction("printf"); + if (!PrintfDecl) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), true); + PrintfDecl = Function::Create(Ty, Linkage, "printf", M); + } + + SmallVector Args; + Args.push_back(FormatStrConst); + Args.append(Values.begin(), Values.end()); + return Builder.CreateCall(PrintfDecl, Args); +} + +/// Verify that blocks in \p RefOrder are corresponds to the depth-first visit +/// order the control flow of \p F. +/// +/// This is an easy way to verify the branching structure of the CFG without +/// checking every branch instruction individually. For the CFG of a +/// CanonicalLoopInfo, the Cond BB's terminating branch's first edge is entering +/// the body, i.e. the DFS order corresponds to the execution order with one +/// loop iteration. +static testing::AssertionResult +verifyDFSOrder(Function *F, ArrayRef RefOrder) { + auto It = RefOrder.begin(); + auto E = RefOrder.end(); + + df_iterator_default_set Visited; + auto DFS = llvm::depth_first_ext(&F->getEntryBlock(), Visited); + + BasicBlock *Prev = nullptr; + for (BasicBlock *BB : DFS) { + if (It != E && BB == *It) { + Prev = *It; + ++It; + } + } + + if (It == E) + return testing::AssertionSuccess(); + if (!Prev) + return testing::AssertionFailure() + << "Did not find " << (*It)->getName() << " in control flow"; + return testing::AssertionFailure() + << "Expected " << Prev->getName() << " before " << (*It)->getName() + << " in control flow"; +} + +/// Verify that blocks in \p RefOrder are in the same relative order in the +/// linked lists of blocks in \p F. The linked list may contain additional +/// blocks in-between. +/// +/// While the order in the linked list is not relevant for semantics, keeping +/// the order roughly in execution order makes its printout easier to read. +static testing::AssertionResult +verifyListOrder(Function *F, ArrayRef RefOrder) { + auto It = RefOrder.begin(); + auto E = RefOrder.end(); + + BasicBlock *Prev = nullptr; + for (BasicBlock &BB : *F) { + if (It != E && &BB == *It) { + Prev = *It; + ++It; + } + } + + if (It == E) + return testing::AssertionSuccess(); + if (!Prev) + return testing::AssertionFailure() << "Did not find " << (*It)->getName() + << " in function " << F->getName(); + return testing::AssertionFailure() + << "Expected " << Prev->getName() << " before " << (*It)->getName() + << " in function " << F->getName(); +} + class OpenMPIRBuilderTest : public testing::Test { protected: void SetUp() override { @@ -1071,6 +1160,366 @@ EXPECT_FALSE(verifyModule(*M, &errs())); } +TEST_F(OpenMPIRBuilderTest, TileSingleLoop) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + + IRBuilder<> Builder(BB); + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + Value *TripCount = F->getArg(0); + + BasicBlock *BodyCode = nullptr; + Instruction *Call = nullptr; + auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, llvm::Value *LC) { + Builder.restoreIP(CodeGenIP); + BodyCode = Builder.GetInsertBlock(); + + // Add something that consumes the induction variable to the body. + Call = createPrintfCall(Builder, "%d\\n", {LC}); + }; + CanonicalLoopInfo *Loop = + OMPBuilder.createCanonicalLoop(Loc, LoopBodyGenCB, TripCount); + + // Finalize the function. + Builder.restoreIP(Loop->getAfterIP()); + Builder.CreateRetVoid(); + + Instruction *OrigIndVar = Loop->getIndVar(); + EXPECT_EQ(Call->getOperand(1), OrigIndVar); + + // Tile the loop. + Constant *TileSize = ConstantInt::get(Loop->getIndVarType(), APInt(32, 7)); + std::vector GenLoops = + OMPBuilder.tileLoops(DL, {Loop}, {TileSize}); + + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); + + EXPECT_EQ(GenLoops.size(), 2); + CanonicalLoopInfo *Floor = GenLoops[0]; + CanonicalLoopInfo *Tile = GenLoops[1]; + + BasicBlock *RefOrder[] = { + Floor->getPreheader(), Floor->getHeader(), Floor->getCond(), + Floor->getBody(), Tile->getPreheader(), Tile->getHeader(), + Tile->getCond(), Tile->getBody(), BodyCode, + Tile->getLatch(), Tile->getExit(), Tile->getAfter(), + Floor->getLatch(), Floor->getExit(), Floor->getAfter(), + }; + EXPECT_TRUE(verifyDFSOrder(F, RefOrder)); + EXPECT_TRUE(verifyListOrder(F, RefOrder)); + + // Check the induction variable. + EXPECT_EQ(Call->getParent(), BodyCode); + auto *Shift = cast(Call->getOperand(1)); + EXPECT_EQ(cast(Shift)->getParent(), Tile->getBody()); + EXPECT_EQ(Shift->getOperand(1), Tile->getIndVar()); + auto *Scale = cast(Shift->getOperand(0)); + EXPECT_EQ(cast(Scale)->getParent(), Tile->getBody()); + EXPECT_EQ(Scale->getOperand(0), TileSize); + EXPECT_EQ(Scale->getOperand(1), Floor->getIndVar()); +} + +TEST_F(OpenMPIRBuilderTest, TileNestedLoops) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + + IRBuilder<> Builder(BB); + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + Value *TripCount = F->getArg(0); + Type *LCTy = TripCount->getType(); + + BasicBlock *BodyCode = nullptr; + CanonicalLoopInfo *InnerLoop = nullptr; + auto OuterLoopBodyGenCB = [&](InsertPointTy OuterCodeGenIP, + llvm::Value *OuterLC) { + auto InnerLoopBodyGenCB = [&](InsertPointTy InnerCodeGenIP, + llvm::Value *InnerLC) { + Builder.restoreIP(InnerCodeGenIP); + BodyCode = Builder.GetInsertBlock(); + + // Add something that consumes the induction variables to the body. + createPrintfCall(Builder, "i=%d j=%d\\n", {OuterLC, InnerLC}); + }; + InnerLoop = OMPBuilder.createCanonicalLoop( + OuterCodeGenIP, InnerLoopBodyGenCB, TripCount, "inner"); + }; + CanonicalLoopInfo *OuterLoop = OMPBuilder.createCanonicalLoop( + Loc, OuterLoopBodyGenCB, TripCount, "outer"); + + // Finalize the function. + Builder.restoreIP(OuterLoop->getAfterIP()); + Builder.CreateRetVoid(); + + // Tile to loop nest. + Constant *OuterTileSize = ConstantInt::get(LCTy, APInt(32, 11)); + Constant *InnerTileSize = ConstantInt::get(LCTy, APInt(32, 7)); + std::vector GenLoops = OMPBuilder.tileLoops( + DL, {OuterLoop, InnerLoop}, {OuterTileSize, InnerTileSize}); + + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); + + EXPECT_EQ(GenLoops.size(), 4); + CanonicalLoopInfo *Floor1 = GenLoops[0]; + CanonicalLoopInfo *Floor2 = GenLoops[1]; + CanonicalLoopInfo *Tile1 = GenLoops[2]; + CanonicalLoopInfo *Tile2 = GenLoops[3]; + + BasicBlock *RefOrder[] = { + Floor1->getPreheader(), + Floor1->getHeader(), + Floor1->getCond(), + Floor1->getBody(), + Floor2->getPreheader(), + Floor2->getHeader(), + Floor2->getCond(), + Floor2->getBody(), + Tile1->getPreheader(), + Tile1->getHeader(), + Tile1->getCond(), + Tile1->getBody(), + Tile2->getPreheader(), + Tile2->getHeader(), + Tile2->getCond(), + Tile2->getBody(), + BodyCode, + Tile2->getLatch(), + Tile2->getExit(), + Tile2->getAfter(), + Tile1->getLatch(), + Tile1->getExit(), + Tile1->getAfter(), + Floor2->getLatch(), + Floor2->getExit(), + Floor2->getAfter(), + Floor1->getLatch(), + Floor1->getExit(), + Floor1->getAfter(), + }; + EXPECT_TRUE(verifyDFSOrder(F, RefOrder)); + EXPECT_TRUE(verifyListOrder(F, RefOrder)); +} + +TEST_F(OpenMPIRBuilderTest, TileNestedLoopsWithBounds) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + + IRBuilder<> Builder(BB); + Value *TripCount = F->getArg(0); + Type *LCTy = TripCount->getType(); + + Value *OuterStartVal = ConstantInt::get(LCTy, {2}); + Value *OuterStopVal = TripCount; + Value *OuterStep = ConstantInt::get(LCTy, {5}); + Value *InnerStartVal = ConstantInt::get(LCTy, {13}); + Value *InnerStopVal = TripCount; + Value *InnerStep = ConstantInt::get(LCTy, {3}); + + // Fix an insertion point for ComputeIP. + BasicBlock *LoopNextEnter = + BasicBlock::Create(M->getContext(), "loopnest.enter", F, + Builder.GetInsertBlock()->getNextNode()); + BranchInst *EnterBr = Builder.CreateBr(LoopNextEnter); + InsertPointTy ComputeIP{EnterBr->getParent(), EnterBr->getIterator()}; + + InsertPointTy LoopIP{LoopNextEnter, LoopNextEnter->begin()}; + OpenMPIRBuilder::LocationDescription Loc({LoopIP, DL}); + + BasicBlock *BodyCode = nullptr; + CanonicalLoopInfo *InnerLoop = nullptr; + CallInst *Call = nullptr; + auto OuterLoopBodyGenCB = [&](InsertPointTy OuterCodeGenIP, + llvm::Value *OuterLC) { + auto InnerLoopBodyGenCB = [&](InsertPointTy InnerCodeGenIP, + llvm::Value *InnerLC) { + Builder.restoreIP(InnerCodeGenIP); + BodyCode = Builder.GetInsertBlock(); + + // Add something that consumes the induction variable to the body. + Call = createPrintfCall(Builder, "i=%d j=%d\\n", {OuterLC, InnerLC}); + }; + InnerLoop = OMPBuilder.createCanonicalLoop( + OuterCodeGenIP, InnerLoopBodyGenCB, InnerStartVal, InnerStopVal, + InnerStep, false, false, ComputeIP, "inner"); + }; + CanonicalLoopInfo *OuterLoop = OMPBuilder.createCanonicalLoop( + Loc, OuterLoopBodyGenCB, OuterStartVal, OuterStopVal, OuterStep, false, + false, ComputeIP, "outer"); + + // Finalize the function + Builder.restoreIP(OuterLoop->getAfterIP()); + Builder.CreateRetVoid(); + + // Tile the loop nest. + Constant *TileSize0 = ConstantInt::get(LCTy, APInt(32, 11)); + Constant *TileSize1 = ConstantInt::get(LCTy, APInt(32, 7)); + std::vector GenLoops = + OMPBuilder.tileLoops(DL, {OuterLoop, InnerLoop}, {TileSize0, TileSize1}); + + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); + + EXPECT_EQ(GenLoops.size(), 4); + CanonicalLoopInfo *Floor0 = GenLoops[0]; + CanonicalLoopInfo *Floor1 = GenLoops[1]; + CanonicalLoopInfo *Tile0 = GenLoops[2]; + CanonicalLoopInfo *Tile1 = GenLoops[3]; + + BasicBlock *RefOrder[] = { + Floor0->getPreheader(), + Floor0->getHeader(), + Floor0->getCond(), + Floor0->getBody(), + Floor1->getPreheader(), + Floor1->getHeader(), + Floor1->getCond(), + Floor1->getBody(), + Tile0->getPreheader(), + Tile0->getHeader(), + Tile0->getCond(), + Tile0->getBody(), + Tile1->getPreheader(), + Tile1->getHeader(), + Tile1->getCond(), + Tile1->getBody(), + BodyCode, + Tile1->getLatch(), + Tile1->getExit(), + Tile1->getAfter(), + Tile0->getLatch(), + Tile0->getExit(), + Tile0->getAfter(), + Floor1->getLatch(), + Floor1->getExit(), + Floor1->getAfter(), + Floor0->getLatch(), + Floor0->getExit(), + Floor0->getAfter(), + }; + EXPECT_TRUE(verifyDFSOrder(F, RefOrder)); + EXPECT_TRUE(verifyListOrder(F, RefOrder)); + + EXPECT_EQ(Call->getParent(), BodyCode); + + auto *RangeShift0 = cast(Call->getOperand(1)); + EXPECT_EQ(RangeShift0->getOperand(1), OuterStartVal); + auto *RangeScale0 = cast(RangeShift0->getOperand(0)); + EXPECT_EQ(RangeScale0->getOperand(1), OuterStep); + auto *TileShift0 = cast(RangeScale0->getOperand(0)); + EXPECT_EQ(cast(TileShift0)->getParent(), Tile1->getBody()); + EXPECT_EQ(TileShift0->getOperand(1), Tile0->getIndVar()); + auto *TileScale0 = cast(TileShift0->getOperand(0)); + EXPECT_EQ(cast(TileScale0)->getParent(), Tile1->getBody()); + EXPECT_EQ(TileScale0->getOperand(0), TileSize0); + EXPECT_EQ(TileScale0->getOperand(1), Floor0->getIndVar()); + + auto *RangeShift1 = cast(Call->getOperand(2)); + EXPECT_EQ(cast(RangeShift1)->getParent(), BodyCode); + EXPECT_EQ(RangeShift1->getOperand(1), InnerStartVal); + auto *RangeScale1 = cast(RangeShift1->getOperand(0)); + EXPECT_EQ(cast(RangeScale1)->getParent(), BodyCode); + EXPECT_EQ(RangeScale1->getOperand(1), InnerStep); + auto *TileShift1 = cast(RangeScale1->getOperand(0)); + EXPECT_EQ(cast(TileShift1)->getParent(), Tile1->getBody()); + EXPECT_EQ(TileShift1->getOperand(1), Tile1->getIndVar()); + auto *TileScale1 = cast(TileShift1->getOperand(0)); + EXPECT_EQ(cast(TileScale1)->getParent(), Tile1->getBody()); + EXPECT_EQ(TileScale1->getOperand(0), TileSize1); + EXPECT_EQ(TileScale1->getOperand(1), Floor1->getIndVar()); +} + +TEST_F(OpenMPIRBuilderTest, TileSingleLoopCounts) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + IRBuilder<> Builder(BB); + + // Create a loop, tile it, and extract its trip count. All input values are + // constant and IRBuilder evaluates all-constant arithmetic inplace, such that + // the floor trip count itself will be a ConstantInt. Unfortunately we cannot + // do the same for the tile loop. + auto GetFloorCount = [&](int64_t Start, int64_t Stop, int64_t Step, + bool IsSigned, bool InclusiveStop, + int64_t TileSize) -> uint64_t { + OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL); + Type *LCTy = Type::getInt16Ty(Ctx); + Value *StartVal = ConstantInt::get(LCTy, Start); + Value *StopVal = ConstantInt::get(LCTy, Stop); + Value *StepVal = ConstantInt::get(LCTy, Step); + + // Generate a loop. + auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, llvm::Value *LC) {}; + CanonicalLoopInfo *Loop = + OMPBuilder.createCanonicalLoop(Loc, LoopBodyGenCB, StartVal, StopVal, + StepVal, IsSigned, InclusiveStop); + + // Tile the loop. + Value *TileSizeVal = ConstantInt::get(LCTy, TileSize); + std::vector GenLoops = + OMPBuilder.tileLoops(Loc.DL, {Loop}, {TileSizeVal}); + + // Set the insertion pointer to after loop, where the next loop will be + // emitted. + Builder.restoreIP(Loop->getAfterIP()); + + // Extract the trip count. + CanonicalLoopInfo *FloorLoop = GenLoops[0]; + Value *FloorTripCount = FloorLoop->getTripCount(); + return cast(FloorTripCount)->getValue().getZExtValue(); + }; + + // Empty iteration domain. + EXPECT_EQ(GetFloorCount(0, 0, 1, false, false, 7), 0); + EXPECT_EQ(GetFloorCount(0, -1, 1, false, true, 7), 0); + EXPECT_EQ(GetFloorCount(-1, -1, -1, true, false, 7), 0); + EXPECT_EQ(GetFloorCount(-1, 0, -1, true, true, 7), 0); + EXPECT_EQ(GetFloorCount(-1, -1, 3, true, false, 7), 0); + + // Only complete tiles. + EXPECT_EQ(GetFloorCount(0, 14, 1, false, false, 7), 2); + EXPECT_EQ(GetFloorCount(0, 14, 1, false, false, 7), 2); + EXPECT_EQ(GetFloorCount(1, 15, 1, false, false, 7), 2); + EXPECT_EQ(GetFloorCount(0, -14, -1, true, false, 7), 2); + EXPECT_EQ(GetFloorCount(-1, -14, -1, true, true, 7), 2); + EXPECT_EQ(GetFloorCount(0, 3 * 7 * 2, 3, false, false, 7), 2); + + // Only a partial tile. + EXPECT_EQ(GetFloorCount(0, 1, 1, false, false, 7), 1); + EXPECT_EQ(GetFloorCount(0, 6, 1, false, false, 7), 1); + EXPECT_EQ(GetFloorCount(-1, 1, 3, true, false, 7), 1); + EXPECT_EQ(GetFloorCount(-1, -2, -1, true, false, 7), 1); + EXPECT_EQ(GetFloorCount(0, 2, 3, false, false, 7), 1); + + // Complete and partial tiles. + EXPECT_EQ(GetFloorCount(0, 13, 1, false, false, 7), 2); + EXPECT_EQ(GetFloorCount(0, 15, 1, false, false, 7), 3); + EXPECT_EQ(GetFloorCount(-1, -14, -1, true, false, 7), 2); + EXPECT_EQ(GetFloorCount(0, 3 * 7 * 5 - 1, 3, false, false, 7), 5); + EXPECT_EQ(GetFloorCount(-1, -3 * 7 * 5, -3, true, false, 7), 5); + + // Close to 16-bit integer range. + EXPECT_EQ(GetFloorCount(0, 0xFFFF, 1, false, false, 1), 0xFFFF); + EXPECT_EQ(GetFloorCount(0, 0xFFFF, 1, false, false, 7), 0xFFFF / 7 + 1); + EXPECT_EQ(GetFloorCount(0, 0xFFFE, 1, false, true, 7), 0xFFFF / 7 + 1); + EXPECT_EQ(GetFloorCount(-0x8000, 0x7FFF, 1, true, false, 7), 0xFFFF / 7 + 1); + EXPECT_EQ(GetFloorCount(-0x7FFF, 0x7FFF, 1, true, true, 7), 0xFFFF / 7 + 1); + EXPECT_EQ(GetFloorCount(0, 0xFFFE, 1, false, false, 0xFFFF), 1); + EXPECT_EQ(GetFloorCount(-0x8000, 0x7FFF, 1, true, false, 0xFFFF), 1); + + // Finalize the function. + Builder.CreateRetVoid(); + OMPBuilder.finalize(); + + EXPECT_FALSE(verifyModule(*M, &errs())); +} + TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M);