diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -135,6 +135,89 @@ } } +/// Clone a single entry single exit set of blocks \p Blocks. \p FirstBlock is +/// expected to be the single entry of \p Blocks. \p Blocks is expected to be +/// contained in a loop. +/// +/// Updates LoopInfo and DominatorTree assuming \p Blocks are dominated by block +/// \p BlocksDomBB. Insert the new blocks before block specified in \p Before. +static void cloneBasicBlocksInLoop(BasicBlock *Before, BasicBlock *BlocksDomBB, + BasicBlock *FirstBlock, + SmallPtrSetImpl &Blocks, + ValueToValueMapTy &VMap, + const Twine &NameSuffix, LoopInfo *LI, + DominatorTree *DT, + SmallVectorImpl &NewBlocks) { + assert(!Blocks.empty() && "Expecting non-empty Blocks"); + assert(Blocks.count(FirstBlock) && + "Expecting FirstBlock to be part of Blocks"); + Function *F = FirstBlock->getParent(); + Loop *L = LI->getLoopFor(FirstBlock); + assert(L && "Expecting FirstBlock to be in a loop"); + DenseMap LMap; + LMap[L] = L; + + BasicBlock *NewFirstBlock = CloneBasicBlock(FirstBlock, VMap, NameSuffix, F); + VMap[FirstBlock] = NewFirstBlock; + L->addBasicBlockToLoop(NewFirstBlock, *LI); + DT->addNewBlock(NewFirstBlock, BlocksDomBB); + NewBlocks.push_back(NewFirstBlock); + + // Allocate loops that are descendants of L, and contained in Blocks. + for (Loop *CurLoop : L->getLoopsInPreorder()) { + if (!Blocks.count(CurLoop->getHeader())) + continue; + + Loop *&NewLoop = LMap[CurLoop]; + if (!NewLoop) { + NewLoop = LI->AllocateLoop(); + + // Establish the parent/child relationship. + Loop *OrigParent = CurLoop->getParentLoop(); + assert(OrigParent && "Could not find the original parent loop"); + Loop *NewParentLoop = LMap[OrigParent]; + assert(NewParentLoop && "Could not find the new parent loop"); + + NewParentLoop->addChildLoop(NewLoop); + } + } + + // Clone Blocks. + for (BasicBlock *BB : Blocks) { + if (BB == FirstBlock) + continue; + + Loop *CurLoop = LI->getLoopFor(BB); + assert(CurLoop && "Expecting BB to be in a loop"); + Loop *NewLoop = LMap[CurLoop]; + assert(NewLoop && "Expecting new loop to be allocated"); + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); + VMap[BB] = NewBB; + + NewLoop->addBasicBlockToLoop(NewBB, *LI); + if (CurLoop->getHeader() == BB) + NewLoop->moveToHeader(NewBB); + + // Add DominatorTree node. After seeing all blocks, update to correct + // IDom. + DT->addNewBlock(NewBB, BlocksDomBB); + + NewBlocks.push_back(NewBB); + } + + // Update DominatorTree. + for (BasicBlock *BB : Blocks) { + BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); + if (VMap.count(IDomBB)) + DT->changeImmediateDominator(cast(VMap[BB]), + cast(VMap[IDomBB])); + } + + // Move them physically from the end of the block list. + F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), + NewFirstBlock->getIterator(), F->end()); +} + /* This method performs Unroll and Jam. For a simple loop like: for (i = ..) @@ -286,15 +369,6 @@ moveHeaderPhiOperandsToForeBlocks( Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks); - // The current on-the-fly SSA update requires blocks to be processed in - // reverse postorder so that LastValueMap contains the correct value at each - // exit. - LoopBlocksDFS DFS(L); - DFS.perform(LI); - // Stash the DFS iterators before adding blocks to the loop. - LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); - LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); - if (Header->getParent()->isDebugInfoForProfiling()) for (BasicBlock *BB : L->getBlocks()) for (Instruction &I : *BB) @@ -315,66 +389,44 @@ // Maps Blocks[It] -> Blocks[It-1] DenseMap PrevItValueMap; - for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { + // Copy ForeBlocks. + { ValueToValueMapTy VMap; - BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); - Header->getParent()->getBasicBlockList().push_back(New); - - if (ForeBlocks.count(*BB)) { - L->addBasicBlockToLoop(New, *LI); - - if (*BB == ForeBlocksFirst[0]) - ForeBlocksFirst.push_back(New); - if (*BB == ForeBlocksLast[0]) - ForeBlocksLast.push_back(New); - } else if (SubLoopBlocks.count(*BB)) { - SubLoop->addBasicBlockToLoop(New, *LI); - - if (*BB == SubLoopBlocksFirst[0]) - SubLoopBlocksFirst.push_back(New); - if (*BB == SubLoopBlocksLast[0]) - SubLoopBlocksLast.push_back(New); - } else if (AftBlocks.count(*BB)) { - L->addBasicBlockToLoop(New, *LI); - - if (*BB == AftBlocksFirst[0]) - AftBlocksFirst.push_back(New); - if (*BB == AftBlocksLast[0]) - AftBlocksLast.push_back(New); - } else { - llvm_unreachable("BB being cloned should be in Fore/Sub/Aft"); + cloneBasicBlocksInLoop(SubLoopBlocksFirst[0], ForeBlocksLast[It - 1], + ForeBlocksFirst[0], ForeBlocks, VMap, + "." + Twine(It), LI, DT, NewBlocks); + ForeBlocksFirst.push_back(cast(VMap[ForeBlocksFirst[0]])); + ForeBlocksLast.push_back(cast(VMap[ForeBlocksLast[0]])); + for (auto VEntry : VMap) { + PrevItValueMap[VEntry->second] = const_cast( + It == 1 ? VEntry->first : LastValueMap[VEntry->first]); + LastValueMap[VEntry->first] = VEntry->second; } + } - // Update our running maps of newest clones - PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]); - LastValueMap[*BB] = New; - for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); - VI != VE; ++VI) { - PrevItValueMap[VI->second] = - const_cast(It == 1 ? VI->first : LastValueMap[VI->first]); - LastValueMap[VI->first] = VI->second; - } + // Clone SubLoopBlocks. + { + ValueToValueMapTy VMap; + cloneBasicBlocksInLoop(AftBlocksFirst[0], SubLoopBlocksLast[It - 1], + SubLoopBlocksFirst[0], SubLoopBlocks, VMap, + "." + Twine(It), LI, DT, NewBlocks); + SubLoopBlocksFirst.push_back( + cast(VMap[SubLoopBlocksFirst[0]])); + SubLoopBlocksLast.push_back(cast(VMap[SubLoopBlocksLast[0]])); + for (auto VEntry : VMap) + LastValueMap[VEntry->first] = VEntry->second; + } - NewBlocks.push_back(New); - - // Update DomTree: - if (*BB == ForeBlocksFirst[0]) - DT->addNewBlock(New, ForeBlocksLast[It - 1]); - else if (*BB == SubLoopBlocksFirst[0]) - DT->addNewBlock(New, SubLoopBlocksLast[It - 1]); - else if (*BB == AftBlocksFirst[0]) - DT->addNewBlock(New, AftBlocksLast[It - 1]); - else { - // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree - // structure. - auto BBDomNode = DT->getNode(*BB); - auto BBIDom = BBDomNode->getIDom(); - BasicBlock *OriginalBBIDom = BBIDom->getBlock(); - assert(OriginalBBIDom); - assert(LastValueMap[cast(OriginalBBIDom)]); - DT->addNewBlock( - New, cast(LastValueMap[cast(OriginalBBIDom)])); - } + // Clone AftBlocks. + { + ValueToValueMapTy VMap; + cloneBasicBlocksInLoop(LoopExit, AftBlocksLast[It - 1], AftBlocksFirst[0], + AftBlocks, VMap, "." + Twine(It), LI, DT, + NewBlocks); + AftBlocksFirst.push_back(cast(VMap[AftBlocksFirst[0]])); + AftBlocksLast.push_back(cast(VMap[AftBlocksLast[0]])); + for (auto VEntry : VMap) + LastValueMap[VEntry->first] = VEntry->second; } // Remap all instructions in the most recent iteration diff --git a/llvm/test/Transforms/LoopUnrollAndJam/loopnest.ll b/llvm/test/Transforms/LoopUnrollAndJam/loopnest.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopUnrollAndJam/loopnest.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -verify-loop-info < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='unroll-and-jam,verify' -allow-unroll-and-jam < %s -S | FileCheck %s + +; The explicit metadata should force loop-i to be unroll and jammed 4 times (hence the %inc.i.3) + +define void @foo(i64 %N, i32* %A) { +; CHECK-LABEL: @foo +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_I:%.*]] +; CHECK: for.i: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_3:%.*]], [[FOR_I_LATCH:%.*]] ] +; CHECK-NEXT: [[INC_I:%.*]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[INC_I_1:%.*]] = add nuw nsw i64 [[INC_I]], 1 +; CHECK-NEXT: [[INC_I_2:%.*]] = add nuw nsw i64 [[INC_I_1]], 1 +; CHECK-NEXT: [[INC_I_3]] = add nuw nsw i64 [[INC_I_2]], 1 +; CHECK-NEXT: br label [[FOR_J:%.*]] +; CHECK: for.j: +; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, [[FOR_I]] ], [ [[INC_J:%.*]], [[FOR_J_LATCH_3:%.*]] ] +; CHECK-NEXT: [[J_1:%.*]] = phi i64 [ 0, [[FOR_I]] ], [ [[INC_J_1:%.*]], [[FOR_J_LATCH_3]] ] +; CHECK-NEXT: [[J_2:%.*]] = phi i64 [ 0, [[FOR_I]] ], [ [[INC_J_2:%.*]], [[FOR_J_LATCH_3]] ] +; CHECK-NEXT: [[J_3:%.*]] = phi i64 [ 0, [[FOR_I]] ], [ [[INC_J_3:%.*]], [[FOR_J_LATCH_3]] ] +; CHECK-NEXT: br label [[FOR_K:%.*]] +; CHECK: for.k: +; CHECK-NEXT: [[K:%.*]] = phi i64 [ 0, [[FOR_J]] ], [ [[INC_K:%.*]], [[FOR_K]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul nuw i64 [[N:%.*]], [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 [[I]], [[TMP0]] +; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 [[J]], [[N]] +; CHECK-NEXT: [[AIJ:%.*]] = getelementptr inbounds i32, i32* [[AI]], i64 [[TMP2]] +; CHECK-NEXT: [[AIJK:%.*]] = getelementptr inbounds i32, i32* [[AIJ]], i64 [[K]] +; CHECK-NEXT: store i32 0, i32* [[AIJK]], align 4 +; CHECK-NEXT: [[INC_K]] = add nsw i64 [[K]], 1 +; CHECK-NEXT: [[CMP_K:%.*]] = icmp slt i64 [[INC_K]], 300 +; CHECK-NEXT: br i1 [[CMP_K]], label [[FOR_K]], label [[FOR_J_LATCH:%.*]] +; CHECK: for.j.latch: +; CHECK-NEXT: [[INC_J]] = add nuw nsw i64 [[J]], 1 +; CHECK-NEXT: br label [[FOR_K_1:%.*]] +; CHECK: for.k.1: +; CHECK-NEXT: [[K_1:%.*]] = phi i64 [ 0, [[FOR_J_LATCH]] ], [ [[INC_K_1:%.*]], [[FOR_K_1]] ] +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[N]], [[N]] +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 [[INC_I]], [[TMP3]] +; CHECK-NEXT: [[AI_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[J_1]], [[N]] +; CHECK-NEXT: [[AIJ_1:%.*]] = getelementptr inbounds i32, i32* [[AI_1]], i64 [[TMP5]] +; CHECK-NEXT: [[AIJK_1:%.*]] = getelementptr inbounds i32, i32* [[AIJ_1]], i64 [[K_1]] +; CHECK-NEXT: store i32 0, i32* [[AIJK_1]], align 4 +; CHECK-NEXT: [[INC_K_1]] = add nsw i64 [[K_1]], 1 +; CHECK-NEXT: [[CMP_K_1:%.*]] = icmp slt i64 [[INC_K_1]], 300 +; CHECK-NEXT: br i1 [[CMP_K_1]], label [[FOR_K_1]], label [[FOR_J_LATCH_1:%.*]] +; CHECK: for.j.latch.1: +; CHECK-NEXT: [[INC_J_1]] = add nuw nsw i64 [[J_1]], 1 +; CHECK-NEXT: br label [[FOR_K_2:%.*]] +; CHECK: for.k.2: +; CHECK-NEXT: [[K_2:%.*]] = phi i64 [ 0, [[FOR_J_LATCH_1]] ], [ [[INC_K_2:%.*]], [[FOR_K_2]] ] +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[N]], [[N]] +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw i64 [[INC_I_1]], [[TMP6]] +; CHECK-NEXT: [[AI_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw i64 [[J_2]], [[N]] +; CHECK-NEXT: [[AIJ_2:%.*]] = getelementptr inbounds i32, i32* [[AI_2]], i64 [[TMP8]] +; CHECK-NEXT: [[AIJK_2:%.*]] = getelementptr inbounds i32, i32* [[AIJ_2]], i64 [[K_2]] +; CHECK-NEXT: store i32 0, i32* [[AIJK_2]], align 4 +; CHECK-NEXT: [[INC_K_2]] = add nsw i64 [[K_2]], 1 +; CHECK-NEXT: [[CMP_K_2:%.*]] = icmp slt i64 [[INC_K_2]], 300 +; CHECK-NEXT: br i1 [[CMP_K_2]], label [[FOR_K_2]], label [[FOR_J_LATCH_2:%.*]] +; CHECK: for.j.latch.2: +; CHECK-NEXT: [[INC_J_2]] = add nuw nsw i64 [[J_2]], 1 +; CHECK-NEXT: br label [[FOR_K_3:%.*]] +; CHECK: for.k.3: +; CHECK-NEXT: [[K_3:%.*]] = phi i64 [ 0, [[FOR_J_LATCH_2]] ], [ [[INC_K_3:%.*]], [[FOR_K_3]] ] +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[N]], [[N]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i64 [[INC_I_2]], [[TMP9]] +; CHECK-NEXT: [[AI_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw i64 [[J_3]], [[N]] +; CHECK-NEXT: [[AIJ_3:%.*]] = getelementptr inbounds i32, i32* [[AI_3]], i64 [[TMP11]] +; CHECK-NEXT: [[AIJK_3:%.*]] = getelementptr inbounds i32, i32* [[AIJ_3]], i64 [[K_3]] +; CHECK-NEXT: store i32 0, i32* [[AIJK_3]], align 4 +; CHECK-NEXT: [[INC_K_3]] = add nsw i64 [[K_3]], 1 +; CHECK-NEXT: [[CMP_K_3:%.*]] = icmp slt i64 [[INC_K_3]], 300 +; CHECK-NEXT: br i1 [[CMP_K_3]], label [[FOR_K_3]], label [[FOR_J_LATCH_3]] +; CHECK: for.j.latch.3: +; CHECK-NEXT: [[INC_J_3]] = add nuw nsw i64 [[J_3]], 1 +; CHECK-NEXT: [[CMP_J_3:%.*]] = icmp ult i64 [[INC_J_3]], 200 +; CHECK-NEXT: br i1 [[CMP_J_3]], label [[FOR_J]], label [[FOR_I_LATCH]] +; CHECK: for.i.latch: +; CHECK-NEXT: [[CMP_I_3:%.*]] = icmp ult i64 [[INC_I_3]], 100 +; CHECK-NEXT: br i1 [[CMP_I_3]], label [[FOR_I]], label [[FOR_END:%.*]], !llvm.loop !0 +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.i + +for.i: + %i = phi i64 [ 0, %entry ], [ %inc.i, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i64 [ 0, %for.i ], [ %inc.j, %for.j.latch ] + br label %for.k + +for.k: + %k = phi i64 [ 0, %for.j ], [ %inc.k, %for.k ] + %0 = mul nuw i64 %N, %N + %1 = mul nsw i64 %i, %0 + %Ai = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = mul nsw i64 %j, %N + %Aij = getelementptr inbounds i32, i32* %Ai, i64 %2 + %Aijk = getelementptr inbounds i32, i32* %Aij, i64 %k + store i32 0, i32* %Aijk, align 4 + %inc.k = add nsw i64 %k, 1 + %cmp.k = icmp slt i64 %inc.k, 300 + br i1 %cmp.k, label %for.k, label %for.j.latch + +for.j.latch: + %inc.j = add nsw i64 %j, 1 + %cmp.j = icmp slt i64 %inc.j, 200 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %inc.i = add nsw i64 %i, 1 + %cmp.i = icmp slt i64 %inc.i, 100 + br i1 %cmp.i, label %for.i, label %for.end, !llvm.loop !1 + +for.end: + ret void +} + +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.unroll_and_jam.count", i32 4} diff --git a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll --- a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll @@ -368,16 +368,16 @@ ; CHECK: br label %for.inc ; CHECK: for.inc: ; CHECK: br i1 %tobool.11, label %for.cond4.1, label %for.inc.1 +; CHECK-DAG: for.cond4.1: +; CHECK-DAG: br i1 %tobool.1.1, label %for.cond4a.1, label %for.inc.1 +; CHECK-DAG: for.cond4a.1: +; CHECK-DAG: br label %for.inc.1 +; CHECK-DAG: for.inc.1: +; CHECK-DAG: br i1 %exitcond.1, label %for.latch, label %for.inner ; CHECK: for.latch: ; CHECK: br label %for.end ; CHECK: for.end: ; CHECK: ret i32 0 -; CHECK: for.cond4.1: -; CHECK: br i1 %tobool.1.1, label %for.cond4a.1, label %for.inc.1 -; CHECK: for.cond4a.1: -; CHECK: br label %for.inc.1 -; CHECK: for.inc.1: -; CHECK: br i1 %exitcond.1, label %for.latch, label %for.inner @a = hidden global [1 x i32] zeroinitializer, align 4 define i32 @test5() #0 { entry: