diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -110,7 +110,7 @@
                                   Loop **EpilogueLoop = nullptr);
 
 bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
-                          DependenceInfo &DI);
+                          DependenceInfo &DI, LoopInfo &LI);
 
 bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
                         DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -282,21 +282,6 @@
     ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC,
     DependenceInfo &DI, OptimizationRemarkEmitter &ORE, int OptLevel) {
-  // Quick checks of the correct loop form
-  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
-    return LoopUnrollResult::Unmodified;
-  Loop *SubLoop = L->getSubLoops()[0];
-  if (!SubLoop->isLoopSimplifyForm())
-    return LoopUnrollResult::Unmodified;
-
-  BasicBlock *Latch = L->getLoopLatch();
-  BasicBlock *Exit = L->getExitingBlock();
-  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
-  if (Latch != Exit || SubLoopLatch != SubLoopExit)
-    return LoopUnrollResult::Unmodified;
-
   TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
       L, SE, TTI, nullptr, nullptr, OptLevel, None, None, None, None, None,
       None, None, None);
@@ -326,7 +311,7 @@
     return LoopUnrollResult::Unmodified;
   }
 
-  if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+  if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) {
     LLVM_DEBUG(dbgs() << "  Disabled due to not being safe.\n");
     return LoopUnrollResult::Unmodified;
   }
@@ -337,6 +322,7 @@
   bool Convergent;
   SmallPtrSet<const Value *, 32> EphValues;
   CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+  Loop *SubLoop = L->getSubLoops()[0];
   unsigned InnerLoopSize =
       ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
                           Convergent, TTI, EphValues, UP.BEInsns);
@@ -374,6 +360,8 @@
     SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());
 
   // Find trip count and trip multiple
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
   unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
   unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
   unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -69,17 +70,14 @@
 
 // Partition blocks in an outer/inner loop pair into blocks before and after
 // the loop
-static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
-                                     BasicBlockSet &ForeBlocks,
-                                     BasicBlockSet &SubLoopBlocks,
-                                     BasicBlockSet &AftBlocks,
-                                     DominatorTree *DT) {
+static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
+                                BasicBlockSet &AftBlocks, DominatorTree &DT) {
+  Loop *SubLoop = L.getSubLoops()[0];
   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
 
-  for (BasicBlock *BB : L->blocks()) {
+  for (BasicBlock *BB : L.blocks()) {
     if (!SubLoop->contains(BB)) {
-      if (DT->dominates(SubLoopLatch, BB))
+      if (DT.dominates(SubLoopLatch, BB))
         AftBlocks.insert(BB);
       else
         ForeBlocks.insert(BB);
@@ -93,14 +91,44 @@
     if (BB == SubLoopPreHeader)
       continue;
     Instruction *TI = BB->getTerminator();
-    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
-      if (!ForeBlocks.count(TI->getSuccessor(i)))
+    for (BasicBlock *Succ : successors(TI))
+      if (!ForeBlocks.count(Succ))
         return false;
   }
   return true;
 }
 
+/// Partition blocks in a loop nest into blocks before and after each inner
+/// loop.
+static bool partitionOuterLoopBlocks(
+    Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
+    DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+    DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
+  JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
+
+  for (Loop *L : Root.getLoopsInPreorder()) {
+    if (L == &JamLoop)
+      break;
+
+    if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
+      return false;
+  }
+
+  return true;
+}
+
+// TODO: Remove when UnrollAndJamLoop is changed to support unroll and
+// jamming loop nests more than 2 levels deep.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+                                     BasicBlockSet &ForeBlocks,
+                                     BasicBlockSet &SubLoopBlocks,
+                                     BasicBlockSet &AftBlocks,
+                                     DominatorTree *DT) {
+  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+  return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
+}
+
 // Looks at the phi nodes in Header for values coming from Latch. For these
 // instructions and all their operands calls Visit on them, keeping going for
 // all the operands in AftBlocks. Returns false if Visit returns false,
@@ -642,7 +670,7 @@
 
 static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
                               SmallVector<Value *, 4> &Later,
-                              unsigned LoopDepth, bool InnerLoop,
+                              unsigned LoopDepth, Loop *CurLoop,
                               DependenceInfo &DI) {
   // Use DA to check for dependencies between loads and stores that make unroll
   // and jam invalid
@@ -670,21 +698,43 @@
                           << "  " << *Dst << "\n");
         return false;
       }
-      if (!InnerLoop) {
-        if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
+
+      // A dependence that is not EQ at a level enclosing the unrolled loop
+      // is carried by an outer loop that jamming never reorders; ignore it.
+      if (LoopDepth != 1)
+        if (any_of(llvm::seq<unsigned>(1, LoopDepth),
+                   [&D](unsigned CurLoopDepth) {
+                     return !(D->getDirection(CurLoopDepth) &
+                              Dependence::DVEntry::EQ);
+                   }))
+          continue;
+
+      if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
+        if (!CurLoop) {
           LLVM_DEBUG(dbgs() << "  > dependency between:\n"
                             << "  " << *Src << "\n"
                             << "  " << *Dst << "\n");
           return false;
-        }
-      } else {
-        assert(LoopDepth + 1 <= D->getLevels());
-        if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
-            D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
-          LLVM_DEBUG(dbgs() << "  < > dependency between:\n"
                            << "  " << *Src << "\n"
                            << "  " << *Dst << "\n");
-          return false;
+        } else {
+          assert(CurLoop->getLoopDepth() <= D->getLevels());
+          for (unsigned CurLoopDepth : llvm::seq<unsigned>(
+                   LoopDepth + 1, CurLoop->getLoopDepth() + 1)) {
+            if (D->isScalar(CurLoopDepth))
+              continue;
+
+            // If the dependence direction is LT, then it is not safe to fuse.
+            if (D->getDirection(CurLoopDepth) & Dependence::DVEntry::LT) {
+              LLVM_DEBUG(dbgs() << "  < > dependency between:\n"
+                                << "  " << *Src << "\n"
+                                << "  " << *Dst << "\n");
+              return false;
+            }
+            // If the dependence direction is GT on a more outer loop, then it
+            // doesn't matter what the inner loop dependence direction is, as
+            // the whole dependence direction from (LoopDepth + 1) to
+            // InnerLoopDepth would be GT.
+            if (D->getDirection(CurLoopDepth) == Dependence::DVEntry::GT)
+              break;
+          }
         }
       }
     }
@@ -693,44 +743,118 @@
   return true;
 }
 
-static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
-                              BasicBlockSet &SubLoopBlocks,
-                              BasicBlockSet &AftBlocks, DependenceInfo &DI) {
-  // Get all loads/store pairs for each blocks
-  SmallVector<Value *, 4> ForeMemInstr;
-  SmallVector<Value *, 4> SubLoopMemInstr;
-  SmallVector<Value *, 4> AftMemInstr;
-  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
-      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
-      !getLoadsAndStores(AftBlocks, AftMemInstr))
+static bool
+checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
+                  const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+                  const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
+                  DependenceInfo &DI, LoopInfo &LI) {
+  SmallVector<BasicBlockSet, 8> AllBlocks;
+  for (Loop *L : Root.getLoopsInPreorder())
+    if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
+      AllBlocks.push_back(ForeBlocksMap.lookup(L));
+  AllBlocks.push_back(SubLoopBlocks);
+  for (Loop *L : Root.getLoopsInPreorder())
+    if (AftBlocksMap.find(L) != AftBlocksMap.end())
+      AllBlocks.push_back(AftBlocksMap.lookup(L));
+
+  unsigned LoopDepth = Root.getLoopDepth();
+  auto *LastIterator = AllBlocks.end();
+  --LastIterator;
+  for (auto *I = AllBlocks.begin(); I != LastIterator; ++I) {
+    SmallVector<Value *, 4> EarlierLoadsAndStores;
+    if (!getLoadsAndStores(*I, EarlierLoadsAndStores))
+      return false;
+
+    for (auto *J = I, *E = AllBlocks.end(); J != E; ++J) {
+      if (I == J && (*I == ForeBlocksMap.lookup(&Root) ||
+                     *I == AftBlocksMap.lookup(&Root)))
+        continue;
+
+      SmallVector<Value *, 4> LaterLoadsAndStores;
+      if (!getLoadsAndStores(*J, LaterLoadsAndStores))
+        return false;
+
+      if (!checkDependencies(
+              EarlierLoadsAndStores, LaterLoadsAndStores, LoopDepth,
+              (I == J) ? LI.getLoopFor(*I->begin()) : nullptr, DI))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+static bool isEligibleLoopForm(const Loop &Root) {
+  // Root must have a child.
+  if (Root.getSubLoops().size() != 1)
    return false;
 
-  // Check for dependencies between any blocks that may change order
-  unsigned LoopDepth = L->getLoopDepth();
-  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
-                           DI) &&
-         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
-         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
-                           DI) &&
-         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
-                           DI);
+  const Loop *L = &Root;
+  do {
+    // All loops in Root need to be in loop-simplify and rotated form.
+    if (!L->isLoopSimplifyForm())
+      return false;
+
+    if (!L->isRotatedForm())
+      return false;
+
+    if (L->getHeader()->hasAddressTaken()) {
+      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
+      return false;
+    }
+
+    unsigned SubLoopsSize = L->getSubLoops().size();
+    if (SubLoopsSize == 0)
+      return true;
+
+    // Only one child is allowed.
+    if (SubLoopsSize != 1)
+      return false;
+
+    L = L->getSubLoops()[0];
+  } while (L);
+
+  return true;
+}
+
+static Loop *getInnerMostLoop(Loop *L) {
+  while (!L->getSubLoops().empty())
+    L = L->getSubLoops()[0];
+  return L;
 }
 
 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
-                                DependenceInfo &DI) {
+                                DependenceInfo &DI, LoopInfo &LI) {
+  if (!isEligibleLoopForm(*L)) {
+    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
+    return false;
+  }
+
   /* We currently handle outer loops like this:
         |
-    ForeFirst    <----\        }
-     Blocks           |        } ForeBlocks
-    ForeLast          |        }
-        |             |
-    SubLoopFirst  <\  |        }
-     Blocks        |  |        } SubLoopBlocks
-    SubLoopLast   -/  |        }
-        |             |
-    AftFirst          |        }
-     Blocks           |        } AftBlocks
-    AftLast     ------/        }
+    ForeFirst    <------\       }
+     Blocks             |       } ForeBlocks of L
+    ForeLast            |       }
+        |               |
+       ...              |
+        |               |
+    ForeFirst    <----\ |       }
+     Blocks           | |       } ForeBlocks of an inner loop of L
+    ForeLast          | |       }
+        |             | |
+    JamLoopFirst  <\  | |       }
+     Blocks        |  | |       } JamLoopBlocks of the innermost loop
+    JamLoopLast   -/  | |       }
+        |             | |
+    AftFirst          | |       }
+     Blocks           | |       } AftBlocks of an inner loop of L
+    AftLast     ------/ |       }
+        |               |
+       ...              |
+        |               |
+    AftFirst            |       }
+     Blocks             |       } AftBlocks of L
+    AftLast     --------/       }
         |
 
    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
@@ -740,14 +864,16 @@
    things further in the profitablility checks of the unroll and jam pass.
 
    Because of the way we rearrange basic blocks, we also require that
-   the Fore blocks on all unrolled iterations are safe to move before the
-   SubLoop blocks of all iterations. So we require that the phi node looping
-   operands of ForeHeader can be moved to at least the end of ForeEnd, so that
-   we can arrange cloned Fore Blocks before the subloop and match up Phi's
-   correctly.
+   the Fore blocks of L on all unrolled iterations are safe to move before
+   the blocks of L's direct child on all iterations. So we require that the
+   phi node looping operands of ForeHeader can be moved to at least the end
+   of ForeEnd, so that we can arrange cloned Fore Blocks before the subloop
+   and match up Phi's correctly.
 
-   i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
-   It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+   i.e. The old order of blocks used to be
+   (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
+   It needs to be safe to transform this to
+   (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J2_1 J1_2 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
 
    There are then a number of checks along the lines of no calls, no
    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
@@ -755,35 +881,13 @@
    UnrollAndJamLoop if the trip count cannot be easily calculated.
  */
 
-  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
-    return false;
-  Loop *SubLoop = L->getSubLoops()[0];
-  if (!SubLoop->isLoopSimplifyForm())
-    return false;
-
-  BasicBlock *Header = L->getHeader();
-  BasicBlock *Latch = L->getLoopLatch();
-  BasicBlock *Exit = L->getExitingBlock();
-  BasicBlock *SubLoopHeader = SubLoop->getHeader();
-  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
-  if (Latch != Exit)
-    return false;
-  if (SubLoopLatch != SubLoopExit)
-    return false;
-
-  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
-    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
-    return false;
-  }
-
   // Split blocks into Fore/SubLoop/Aft based on dominators
+  Loop *JamLoop = getInnerMostLoop(L);
   BasicBlockSet SubLoopBlocks;
-  BasicBlockSet ForeBlocks;
-  BasicBlockSet AftBlocks;
-  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
-                                AftBlocks, &DT)) {
+  DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
+  DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
+  if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
+                                AftBlocksMap, DT)) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
     return false;
   }
@@ -791,7 +895,7 @@
   // Aft blocks may need to move instructions to fore blocks, which becomes more
   // difficult if there are multiple (potentially conditionally executed)
   // blocks. For now we just exclude loops with multiple aft blocks.
-  if (AftBlocks.size() != 1) {
+  if (AftBlocksMap[L].size() != 1) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
                          "multiple blocks after the loop\n");
     return false;
@@ -799,7 +903,9 @@
 
   // Check inner loop backedge count is consistent on all iterations of the
   // outer loop
-  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
+  if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
+        return !hasIterationCountInvariantInParent(SubLoop, SE);
+      })) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
                          "not consistent on each iteration\n");
     return false;
@@ -820,6 +926,10 @@
   // ForeBlock phi operands before the subloop
 
   // Make sure we can move all instructions we need to before the subloop
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlockSet AftBlocks = AftBlocksMap[L];
+  Loop *SubLoop = L->getSubLoops()[0];
   if (!processHeaderPhiOperands(
           Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
             if (SubLoop->contains(I->getParent()))
@@ -845,7 +955,8 @@
   // Check for memory dependencies which prohibit the unrolling we are doing.
   // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
   // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
-  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
+  if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
+                         LI)) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
     return false;
   }
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
--- a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
+++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
@@ -1,5 +1,5 @@
-; RUN: opt -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
+; RUN: opt -da-disable-delinearization-checks -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
+; RUN: opt -da-disable-delinearization-checks -aa-pipeline=basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
@@ -360,7 +360,7 @@
 ; CHECK-LABEL: sub_sub_less
 ; CHECK: %j = phi
 ; CHECK-NOT: %j.1 = phi
-define void @sub_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+define void @sub_sub_less([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
 entry:
   %cmp = icmp sgt i32 %N, 0
   br i1 %cmp, label %for.outer, label %cleanup
@@ -377,10 +377,11 @@
   %mul = mul nsw i32 %0, %i
   %add = add nsw i32 %mul, %sum
   %add6 = add nuw nsw i32 %j, 1
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
   store i32 1, i32* %arrayidx, align 4
-  %add72 = add nuw nsw i32 %i, -1
-  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  %add72 = add nuw nsw i32 %i, 1
+  %add73 = add nuw nsw i32 %j, -1
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
   store i32 %add, i32* %arrayidx8, align 4
   %exitcond = icmp eq i32 %add6, %N
   br i1 %exitcond, label %for.latch, label %for.inner
@@ -398,7 +399,9 @@
 ; CHECK-LABEL: sub_sub_eq
 ; CHECK: %j = phi
 ; CHECK: %j.1 = phi
-define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @sub_sub_eq([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
 entry:
   %cmp = icmp sgt i32 %N, 0
   br i1 %cmp, label %for.outer, label %cleanup
@@ -415,10 +418,11 @@
   %mul = mul nsw i32 %0, %i
   %add = add nsw i32 %mul, %sum
   %add6 = add nuw nsw i32 %j, 1
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
   store i32 1, i32* %arrayidx, align 4
-  %add72 = add nuw nsw i32 %i, 0
-  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  %add72 = add nuw nsw i32 %i, 1
+  %add73 = add nuw nsw i32 %j, 0
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
   store i32 %add, i32* %arrayidx8, align 4
   %exitcond = icmp eq i32 %add6, %N
   br i1 %exitcond, label %for.latch, label %for.inner
@@ -435,8 +439,10 @@
 
 ; CHECK-LABEL: sub_sub_more
 ; CHECK: %j = phi
-; CHECK-NOT: %j.1 = phi
-define void @sub_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @sub_sub_more([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
 entry:
   %cmp = icmp sgt i32 %N, 0
   br i1 %cmp, label %for.outer, label %cleanup
@@ -453,10 +459,11 @@
   %mul = mul nsw i32 %0, %i
   %add = add nsw i32 %mul, %sum
   %add6 = add nuw nsw i32 %j, 1
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
   store i32 1, i32* %arrayidx, align 4
   %add72 = add nuw nsw i32 %i, 1
-  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
+  %add73 = add nuw nsw i32 %j, 1
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
   store i32 %add, i32* %arrayidx8, align 4
   %exitcond = icmp eq i32 %add6, %N
   br i1 %exitcond, label %for.latch, label %for.inner
@@ -469,3 +476,55 @@
 cleanup:
   ret void
 }
+
+; CHECK-LABEL: sub_sub_less_3d
+; CHECK: %k = phi
+; CHECK-NOT: %k.1 = phi
+
+; for (long i = 0; i < 100; ++i)
+;   for (long j = 0; j < 100; ++j)
+;     for (long k = 0; k < 100; ++k) {
+;       A[i][j][k] = 0;
+;       A[i+1][j][k-1] = 0;
+;     }
+
+define void @sub_sub_less_3d([100 x [100 x i32]]* noalias %A) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i32 [ 0, %entry ], [ %inc.i, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i ], [ %inc.j, %for.j.latch ]
+  br label %for.k
+
+for.k:
+  %k = phi i32 [ 0, %for.j ], [ %inc.k, %for.k ]
+  %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %A, i32 %i, i32 %j, i32 %k
+  store i32 0, i32* %arrayidx, align 4
+  %add.i = add nsw i32 %i, 1
+  %sub.k = add nsw i32 %k, -1
+  %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %A, i32 %add.i, i32 %j, i32 %sub.k
+  store i32 0, i32* %arrayidx2, align 4
+  %inc.k = add nsw i32 %k, 1
+  %cmp.k = icmp slt i32 %inc.k, 100
+  br i1 %cmp.k, label %for.k, label %for.j.latch
+
+for.j.latch:
+  %inc.j = add nsw i32 %j, 1
+  %cmp.j = icmp slt i32 %inc.j, 100
+  br i1 %cmp.j, label %for.j, label %for.i.latch, !llvm.loop !1
+
+for.i.latch:
+  %inc.i = add nsw i32 %i, 1
+  %cmp.i = icmp slt i32 %inc.i, 100
+  br i1 %cmp.i, label %for.i, label %for.end
+
+for.end:
+  ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.unroll_and_jam.disable"}
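
To make the reordering these checks license concrete, the block-order comment in isSafeToUnrollAndJam can be pictured at the source level. The C++ sketch below is illustrative only: F1, F2, J, A2 and A1 are hypothetical stand-ins for the Fore blocks, jam-loop body and Aft blocks of each level, an even trip count is assumed so no remainder loop is shown, and none of these names appear in the patch.

    // Hypothetical per-level payloads; not from the patch.
    void F1(int i);
    void F2(int i, int j);
    void J(int i, int j, int k);
    void A2(int i, int j);
    void A1(int i);

    // A 3-deep nest of the shape isEligibleLoopForm() accepts, executing in
    // the order (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2
    // (A2)2 (A1)2 listed in the comment above.
    void before(int N) {
      for (int i = 0; i < N; ++i) {
        F1(i);
        for (int j = 0; j < N; ++j) {
          F2(i, j);
          for (int k = 0; k < N; ++k)
            J(i, j, k);
          A2(i, j);
        }
        A1(i);
      }
    }

    // After unroll and jam by 2 on the i loop: the Fore blocks of each level
    // run for both copies first, the jam-loop bodies are interleaved per k
    // iteration, and the Aft blocks of each level run for both copies
    // afterwards, giving (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J2_1 J1_2 J2_2
    // (A2)1 (A2)2 (A1)1 (A1)2.
    void after(int N) {
      for (int i = 0; i < N; i += 2) {
        F1(i);
        F1(i + 1);
        for (int j = 0; j < N; ++j) {
          F2(i, j);
          F2(i + 1, j);
          for (int k = 0; k < N; ++k) {
            J(i, j, k);
            J(i + 1, j, k);
          }
          A2(i, j);
          A2(i + 1, j);
        }
        A1(i);
        A1(i + 1);
      }
    }

Because J(i, j, k) and J(i + 1, j, k) now run back to back, a value flowing from a later i iteration to an earlier k iteration would be needed before it is computed; that is exactly the "GT at the unrolled level, LT at a deeper level" pattern the dependency check rejects.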
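The updated dependencies.ll tests map directly onto this rule. Between the two stores, sub_sub_less has direction (>, <) in one of the two pair orders and must not be jammed; sub_sub_eq has (>, =) and sub_sub_more has (>, >), both of which are now accepted; sub_sub_less_3d has (>, =, <), so unrolling and jamming the i loop is rejected. A standalone model of the direction test may help; everything below (the Direction enum, isJamSafeDirection and its parameters) is a hypothetical simplification for illustration, not code from the patch:

    #include <cstdint>
    #include <vector>

    // Direction bits per loop level, mirroring llvm::Dependence::DVEntry.
    enum Direction : uint8_t { LT = 1, EQ = 2, GT = 4 };

    // Simplified model of the rule in checkDependencies(). Dir[d] is the
    // dependence direction at loop depth d + 1, UnrollDepth is the depth of
    // the loop being unroll-and-jammed, and InnerDepth is the depth of the
    // innermost loop containing both accesses. SameInnerLoop corresponds to
    // CurLoop being non-null in the patch.
    bool isJamSafeDirection(const std::vector<uint8_t> &Dir,
                            unsigned UnrollDepth, unsigned InnerDepth,
                            bool SameInnerLoop) {
      // Not EQ at an enclosing level: the dependence is carried by an outer
      // loop that the transform leaves alone, so it cannot be violated.
      for (unsigned D = 1; D < UnrollDepth; ++D)
        if (!(Dir[D - 1] & EQ))
          return true;

      // '=' or '<' at the jammed level: program order is preserved.
      if (!(Dir[UnrollDepth - 1] & GT))
        return true;

      // '>' between different Fore/Sub/Aft regions is always reordered.
      if (!SameInnerLoop)
        return false;

      // Within the jammed inner loop, '>' is unsafe only if a deeper level
      // can be '<' before a definite '>' fixes the overall direction.
      for (unsigned D = UnrollDepth + 1; D <= InnerDepth; ++D) {
        if (Dir[D - 1] & LT)
          return false;
        if (Dir[D - 1] == GT)
          break;
      }
      return true;
    }

With this model, isJamSafeDirection({GT, LT}, 1, 2, true) is false (sub_sub_less), {GT, EQ} and {GT, GT} are true (sub_sub_eq and sub_sub_more), and isJamSafeDirection({GT, EQ, LT}, 1, 3, true) is false (sub_sub_less_3d), matching the CHECK lines in the test changes above.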