diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -135,6 +135,89 @@ } } +/// Clone a single entry single exit set of blocks \p Blocks. \p FirstBlock is +/// expected to be the single entry of \p Blocks. \p Blocks is expected to be +/// contained in a loop. +/// +/// Updates LoopInfo and DominatorTree assuming \p Blocks are dominated by block +/// \p BlocksDomBB. Insert the new blocks before block specified in \p Before. +static void cloneBasicBlocksInLoop(BasicBlock *Before, BasicBlock *BlocksDomBB, + BasicBlock *FirstBlock, + SmallPtrSetImpl &Blocks, + ValueToValueMapTy &VMap, + const Twine &NameSuffix, LoopInfo *LI, + DominatorTree *DT, + SmallVectorImpl &NewBlocks) { + assert(!Blocks.empty() && "Expecting non-empty Blocks"); + assert(Blocks.count(FirstBlock) && + "Expecting FirstBlock to be part of Blocks"); + Function *F = FirstBlock->getParent(); + Loop *L = LI->getLoopFor(FirstBlock); + assert(L && "Expecting FirstBlock to be in a loop"); + DenseMap LMap; + LMap[L] = L; + + BasicBlock *NewFirstBlock = CloneBasicBlock(FirstBlock, VMap, NameSuffix, F); + VMap[FirstBlock] = NewFirstBlock; + L->addBasicBlockToLoop(NewFirstBlock, *LI); + DT->addNewBlock(NewFirstBlock, BlocksDomBB); + NewBlocks.push_back(NewFirstBlock); + + // Allocate loops that are descendants of L, and contained in Blocks. + for (Loop *CurLoop : L->getLoopsInPreorder()) { + if (!Blocks.count(CurLoop->getHeader())) + continue; + + Loop *&NewLoop = LMap[CurLoop]; + if (!NewLoop) { + NewLoop = LI->AllocateLoop(); + + // Establish the parent/child relationship. + Loop *OrigParent = CurLoop->getParentLoop(); + assert(OrigParent && "Could not find the original parent loop"); + Loop *NewParentLoop = LMap[OrigParent]; + assert(NewParentLoop && "Could not find the new parent loop"); + + NewParentLoop->addChildLoop(NewLoop); + } + } + + // Clone Blocks. + for (BasicBlock *BB : Blocks) { + if (BB == FirstBlock) + continue; + + Loop *CurLoop = LI->getLoopFor(BB); + assert(CurLoop && "Expecting BB to be in a loop"); + Loop *NewLoop = LMap[CurLoop]; + assert(NewLoop && "Expecting new loop to be allocated"); + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); + VMap[BB] = NewBB; + + NewLoop->addBasicBlockToLoop(NewBB, *LI); + if (CurLoop->getHeader() == BB) + NewLoop->moveToHeader(NewBB); + + // Add DominatorTree node. After seeing all blocks, update to correct + // IDom. + DT->addNewBlock(NewBB, BlocksDomBB); + + NewBlocks.push_back(NewBB); + } + + // Update DominatorTree. + for (BasicBlock *BB : Blocks) { + BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); + if (VMap.count(IDomBB)) + DT->changeImmediateDominator(cast(VMap[BB]), + cast(VMap[IDomBB])); + } + + // Move them physically from the end of the block list. + F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), + NewFirstBlock->getIterator(), F->end()); +} + /* This method performs Unroll and Jam. For a simple loop like: for (i = ..) @@ -284,17 +367,7 @@ // Move any instructions from fore phi operands from AftBlocks into Fore. moveHeaderPhiOperandsToForeBlocks( - Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(), - AftBlocks); - - // The current on-the-fly SSA update requires blocks to be processed in - // reverse postorder so that LastValueMap contains the correct value at each - // exit. - LoopBlocksDFS DFS(L); - DFS.perform(LI); - // Stash the DFS iterators before adding blocks to the loop. - LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); - LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); + Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks); if (Header->getParent()->isDebugInfoForProfiling()) for (BasicBlock *BB : L->getBlocks()) @@ -312,76 +385,54 @@ // Copy all blocks for (unsigned It = 1; It != Count; ++It) { - std::vector NewBlocks; + SmallVector NewBlocks; // Maps Blocks[It] -> Blocks[It-1] DenseMap PrevItValueMap; - for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { + // Copy ForeBlocks. + { ValueToValueMapTy VMap; - BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); - Header->getParent()->getBasicBlockList().push_back(New); - - if (ForeBlocks.count(*BB)) { - L->addBasicBlockToLoop(New, *LI); - - if (*BB == ForeBlocksFirst[0]) - ForeBlocksFirst.push_back(New); - if (*BB == ForeBlocksLast[0]) - ForeBlocksLast.push_back(New); - } else if (SubLoopBlocks.count(*BB)) { - SubLoop->addBasicBlockToLoop(New, *LI); - - if (*BB == SubLoopBlocksFirst[0]) - SubLoopBlocksFirst.push_back(New); - if (*BB == SubLoopBlocksLast[0]) - SubLoopBlocksLast.push_back(New); - } else if (AftBlocks.count(*BB)) { - L->addBasicBlockToLoop(New, *LI); - - if (*BB == AftBlocksFirst[0]) - AftBlocksFirst.push_back(New); - if (*BB == AftBlocksLast[0]) - AftBlocksLast.push_back(New); - } else { - llvm_unreachable("BB being cloned should be in Fore/Sub/Aft"); + cloneBasicBlocksInLoop(SubLoopBlocksFirst[0], ForeBlocksLast[It - 1], + ForeBlocksFirst[0], ForeBlocks, VMap, + "." + Twine(It), LI, DT, NewBlocks); + ForeBlocksFirst.push_back(cast(VMap[ForeBlocksFirst[0]])); + ForeBlocksLast.push_back(cast(VMap[ForeBlocksLast[0]])); + for (auto VEntry : VMap) { + PrevItValueMap[VEntry->second] = const_cast( + It == 1 ? VEntry->first : LastValueMap[VEntry->first]); + LastValueMap[VEntry->first] = VEntry->second; } + } - // Update our running maps of newest clones - PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]); - LastValueMap[*BB] = New; - for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); - VI != VE; ++VI) { - PrevItValueMap[VI->second] = - const_cast(It == 1 ? VI->first : LastValueMap[VI->first]); - LastValueMap[VI->first] = VI->second; - } + // Clone SubLoopBlocks. + { + ValueToValueMapTy VMap; + cloneBasicBlocksInLoop(AftBlocksFirst[0], SubLoopBlocksLast[It - 1], + SubLoopBlocksFirst[0], SubLoopBlocks, VMap, + "." + Twine(It), LI, DT, NewBlocks); + SubLoopBlocksFirst.push_back( + cast(VMap[SubLoopBlocksFirst[0]])); + SubLoopBlocksLast.push_back(cast(VMap[SubLoopBlocksLast[0]])); + for (auto VEntry : VMap) + LastValueMap[VEntry->first] = VEntry->second; + } - NewBlocks.push_back(New); - - // Update DomTree: - if (*BB == ForeBlocksFirst[0]) - DT->addNewBlock(New, ForeBlocksLast[It - 1]); - else if (*BB == SubLoopBlocksFirst[0]) - DT->addNewBlock(New, SubLoopBlocksLast[It - 1]); - else if (*BB == AftBlocksFirst[0]) - DT->addNewBlock(New, AftBlocksLast[It - 1]); - else { - // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree - // structure. - auto BBDomNode = DT->getNode(*BB); - auto BBIDom = BBDomNode->getIDom(); - BasicBlock *OriginalBBIDom = BBIDom->getBlock(); - assert(OriginalBBIDom); - assert(LastValueMap[cast(OriginalBBIDom)]); - DT->addNewBlock( - New, cast(LastValueMap[cast(OriginalBBIDom)])); - } + // Clone AftBlocks. + { + ValueToValueMapTy VMap; + cloneBasicBlocksInLoop(LoopExit, AftBlocksLast[It - 1], AftBlocksFirst[0], + AftBlocks, VMap, "." + Twine(It), LI, DT, + NewBlocks); + AftBlocksFirst.push_back(cast(VMap[AftBlocksFirst[0]])); + AftBlocksLast.push_back(cast(VMap[AftBlocksLast[0]])); + for (auto VEntry : VMap) + LastValueMap[VEntry->first] = VEntry->second; } // Remap all instructions in the most recent iteration + remapInstructionsInBlocks(NewBlocks, LastValueMap); for (BasicBlock *NewBlock : NewBlocks) { for (Instruction &I : *NewBlock) { - ::remapInstruction(&I, LastValueMap); if (auto *II = dyn_cast(&I)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); @@ -447,8 +498,8 @@ // Update ForeBlocks successors and phi nodes BranchInst *ForeTerm = cast(ForeBlocksLast.back()->getTerminator()); - BasicBlock *Dest = SubLoopBlocksFirst[0]; - ForeTerm->setSuccessor(0, Dest); + assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); + ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]); if (CompletelyUnroll) { while (PHINode *Phi = dyn_cast(ForeBlocksFirst[0]->begin())) { @@ -465,8 +516,8 @@ // Remap ForeBlock successors from previous iteration to this BranchInst *ForeTerm = cast(ForeBlocksLast[It - 1]->getTerminator()); - BasicBlock *Dest = ForeBlocksFirst[It]; - ForeTerm->setSuccessor(0, Dest); + assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); + ForeTerm->setSuccessor(0, ForeBlocksFirst[It]); } // Subloop successors and phis @@ -495,13 +546,16 @@ } // Aft blocks successors and phis - BranchInst *Term = cast(AftBlocksLast.back()->getTerminator()); + BranchInst *AftTerm = cast(AftBlocksLast.back()->getTerminator()); if (CompletelyUnroll) { - BranchInst::Create(LoopExit, Term); - Term->eraseFromParent(); + BranchInst::Create(LoopExit, AftTerm); + AftTerm->eraseFromParent(); } else { - Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); + AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); + assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit && + "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit"); } + updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0], SubLoopBlocksLast.back()); @@ -518,24 +572,14 @@ movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]); } - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the // new ones required. - if (Count != 1) { - SmallVector DTUpdates; - DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0], - SubLoopBlocksFirst[0]); - DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, - SubLoopBlocksLast[0], AftBlocksFirst[0]); - - DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, - ForeBlocksLast.back(), SubLoopBlocksFirst[0]); - DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, - SubLoopBlocksLast.back(), AftBlocksFirst[0]); - DTU.applyUpdatesPermissive(DTUpdates); - } + DT->changeImmediateDominator(SubLoopBlocksFirst[0], ForeBlocksLast.back()); + DT->changeImmediateDominator(AftBlocksFirst[0], SubLoopBlocksLast.back()); + DT->changeImmediateDominator(LoopExit, AftBlocksLast.back()); // Merge adjacent basic blocks, if possible. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); SmallPtrSet MergeBlocks; MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); @@ -568,21 +612,24 @@ NumCompletelyUnrolledAndJammed += CompletelyUnroll; ++NumUnrolledAndJammed; + // Update LoopInfo if the loop is completely removed. + if (CompletelyUnroll) + LI->erase(L); + #ifndef NDEBUG // We shouldn't have done anything to break loop simplify form or LCSSA. Loop *OuterL = L->getParentLoop(); Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop); + assert(DT->verify()); + LI->verify(*DT); + assert(SubLoop->isRecursivelyLCSSAForm(*DT, *LI)); assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI)); if (!CompletelyUnroll) assert(L->isLoopSimplifyForm()); assert(SubLoop->isLoopSimplifyForm()); - assert(DT->verify()); + SE->verify(); #endif - // Update LoopInfo if the loop is completely removed. - if (CompletelyUnroll) - LI->erase(L); - return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled : LoopUnrollResult::PartiallyUnrolled; } diff --git a/llvm/test/Transforms/LoopUnrollAndJam/loopnest.ll b/llvm/test/Transforms/LoopUnrollAndJam/loopnest.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopUnrollAndJam/loopnest.ll @@ -0,0 +1,48 @@ +; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -verify-loop-info < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='unroll-and-jam,verify' -allow-unroll-and-jam < %s -S | FileCheck %s + +; The explicit metadata should force loop-i to be unroll and jammed 4 times (hence the %inc.i.3) +; CHECK-LABEL: foo +; CHECK: %i = phi i64 [ 0, %entry ], [ %inc.i.3, %for.i.latch ] + +define void @foo(i64 %N, i32* %A) { +entry: + br label %for.i + +for.i: + %i = phi i64 [ 0, %entry ], [ %inc.i, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i64 [ 0, %for.i ], [ %inc.j, %for.j.latch ] + br label %for.k + +for.k: + %k = phi i64 [ 0, %for.j ], [ %inc.k, %for.k ] + %0 = mul nuw i64 %N, %N + %1 = mul nsw i64 %i, %0 + %Ai = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = mul nsw i64 %j, %N + %Aij = getelementptr inbounds i32, i32* %Ai, i64 %2 + %Aijk = getelementptr inbounds i32, i32* %Aij, i64 %k + store i32 0, i32* %Aijk, align 4 + %inc.k = add nsw i64 %k, 1 + %cmp.k = icmp slt i64 %inc.k, 100 + br i1 %cmp.k, label %for.k, label %for.j.latch + +for.j.latch: + %inc.j = add nsw i64 %j, 1 + %cmp.j = icmp slt i64 %inc.j, 100 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %inc.i = add nsw i64 %i, 1 + %cmp.i = icmp slt i64 %inc.i, 100 + br i1 %cmp.i, label %for.i, label %for.end, !llvm.loop !1 + +for.end: + ret void +} + +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.unroll_and_jam.count", i32 4} diff --git a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll --- a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll @@ -368,16 +368,16 @@ ; CHECK: br label %for.inc ; CHECK: for.inc: ; CHECK: br i1 %tobool.11, label %for.cond4.1, label %for.inc.1 -; CHECK: for.latch: -; CHECK: br label %for.end -; CHECK: for.end: -; CHECK: ret i32 0 ; CHECK: for.cond4.1: ; CHECK: br i1 %tobool.1.1, label %for.cond4a.1, label %for.inc.1 ; CHECK: for.cond4a.1: ; CHECK: br label %for.inc.1 ; CHECK: for.inc.1: ; CHECK: br i1 %exitcond.1, label %for.latch, label %for.inner +; CHECK: for.latch: +; CHECK: br label %for.end +; CHECK: for.end: +; CHECK: ret i32 0 @a = hidden global [1 x i32] zeroinitializer, align 4 define i32 @test5() #0 { entry: