diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -110,7 +110,7 @@
                            Loop **EpilogueLoop = nullptr);

 bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
-                          DependenceInfo &DI);
+                          DependenceInfo &DI, LoopInfo &LI);

 bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
                         DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -280,21 +280,6 @@
                  ScalarEvolution &SE, const TargetTransformInfo &TTI,
                  AssumptionCache &AC, DependenceInfo &DI,
                  OptimizationRemarkEmitter &ORE, int OptLevel) {
-  // Quick checks of the correct loop form
-  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
-    return LoopUnrollResult::Unmodified;
-  Loop *SubLoop = L->getSubLoops()[0];
-  if (!SubLoop->isLoopSimplifyForm())
-    return LoopUnrollResult::Unmodified;
-
-  BasicBlock *Latch = L->getLoopLatch();
-  BasicBlock *Exit = L->getExitingBlock();
-  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
-  if (Latch != Exit || SubLoopLatch != SubLoopExit)
-    return LoopUnrollResult::Unmodified;
-
   TargetTransformInfo::UnrollingPreferences UP =
       gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
                                  None, None, None, None, None, None, None);
@@ -324,7 +309,7 @@
     return LoopUnrollResult::Unmodified;
   }

-  if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+  if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) {
     LLVM_DEBUG(dbgs() << "  Disabled due to not being safe.\n");
     return LoopUnrollResult::Unmodified;
   }
@@ -335,6 +320,7 @@
   bool Convergent;
   SmallPtrSet<const Value *, 32> EphValues;
   CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+  Loop *SubLoop = L->getSubLoops()[0];
   unsigned InnerLoopSize =
       ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
                           Convergent, TTI, EphValues, UP.BEInsns);
@@ -372,6 +358,8 @@
       SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());

   // Find trip count and trip multiple
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
   unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
   unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
   unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -69,17 +70,14 @@

 // Partition blocks in an outer/inner loop pair into blocks before and after
 // the loop
-static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
-                                     BasicBlockSet &ForeBlocks,
-                                     BasicBlockSet &SubLoopBlocks,
-                                     BasicBlockSet &AftBlocks,
-                                     DominatorTree *DT) {
+static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
+                                BasicBlockSet &AftBlocks, DominatorTree &DT) {
+  Loop *SubLoop = L.getSubLoops()[0];
   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
-  for (BasicBlock *BB : L->blocks()) {
+  for (BasicBlock *BB : L.blocks()) {
     if (!SubLoop->contains(BB)) {
-      if (DT->dominates(SubLoopLatch, BB))
+      if (DT.dominates(SubLoopLatch, BB))
         AftBlocks.insert(BB);
       else
         ForeBlocks.insert(BB);
@@ -93,14 +91,44 @@
     if (BB == SubLoopPreHeader)
       continue;
     Instruction *TI = BB->getTerminator();
-    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
-      if (!ForeBlocks.count(TI->getSuccessor(i)))
+    for (BasicBlock *Succ : successors(TI))
+      if (!ForeBlocks.count(Succ))
         return false;
   }
   return true;
 }

+/// Partition blocks in a loop nest into blocks before and after each inner
+/// loop.
+static bool partitionOuterLoopBlocks(
+    Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
+    DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+    DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
+  JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
+
+  for (Loop *L : Root.getLoopsInPreorder()) {
+    if (L == &JamLoop)
+      break;
+
+    if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
+      return false;
+  }
+
+  return true;
+}
+
+// TODO: Remove when UnrollAndJamLoop is changed to support unroll and jamming
+// more than 2 levels of loops.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+                                     BasicBlockSet &ForeBlocks,
+                                     BasicBlockSet &SubLoopBlocks,
+                                     BasicBlockSet &AftBlocks,
+                                     DominatorTree *DT) {
+  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+  return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
+}
+
 // Looks at the phi nodes in Header for values coming from Latch. For these
 // instructions and all their operands calls Visit on them, keeping going for
 // all the operands in AftBlocks. Returns false if Visit returns false,
@@ -619,7 +647,7 @@
 }

 static bool getLoadsAndStores(BasicBlockSet &Blocks,
-                              SmallVector<Value *, 4> &MemInstr) {
+                              SmallVector<Instruction *, 4> &MemInstr) {
   // Scan the BBs and collect legal loads and stores.
   // Returns false if non-simple loads/stores are found.
   for (BasicBlock *BB : Blocks) {
@@ -640,97 +668,235 @@
   return true;
 }

-static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
-                              SmallVector<Value *, 4> &Later,
-                              unsigned LoopDepth, bool InnerLoop,
-                              DependenceInfo &DI) {
-  // Use DA to check for dependencies between loads and stores that make unroll
-  // and jam invalid
-  for (Value *I : Earlier) {
-    for (Value *J : Later) {
-      Instruction *Src = cast<Instruction>(I);
-      Instruction *Dst = cast<Instruction>(J);
-      if (Src == Dst)
-        continue;
-      // Ignore Input dependencies.
-      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
-        continue;
-
-      // Track dependencies, and if we find them take a conservative approach
-      // by allowing only = or < (not >), altough some > would be safe
-      // (depending upon unroll width).
-      // For the inner loop, we need to disallow any (> <) dependencies
-      // FIXME: Allow > so long as distance is less than unroll width
-      if (auto D = DI.depends(Src, Dst, true)) {
-        assert(D->isOrdered() && "Expected an output, flow or anti dep.");
-
-        if (D->isConfused()) {
-          LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
-                            << "  " << *Src << "\n"
-                            << "  " << *Dst << "\n");
+static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
+                                       unsigned UnrollLevel, unsigned JamLevel,
+                                       bool Sequentialized, Dependence *D) {
+  // UnrollLevel might carry the dependency Src --> Dst.
+  // Does a different loop carry it after unrolling?
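+  // For example (mirroring the sub_sub_less test added in this patch), stores
+  // to A[i][j] and A[i+1][j-1] give a (<,>) dependence: fusing the inner
+  // loops of consecutive i-iterations would run the dependent store at inner
+  // position j-1, before its source at inner position j, so a GT direction
+  // seen at a jammed level before any LT must be rejected.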
+  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+       ++CurLoopDepth) {
+    auto JammedDir = D->getDirection(CurLoopDepth);
+    if (JammedDir == Dependence::DVEntry::LT)
+      return true;
+
+    if (JammedDir & Dependence::DVEntry::GT)
+      return false;
+  }
+
+  return true;
+}
+
+static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
+                                        unsigned UnrollLevel, unsigned JamLevel,
+                                        bool Sequentialized, Dependence *D) {
+  // UnrollLevel might carry the dependency Dst --> Src.
+  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+       ++CurLoopDepth) {
+    auto JammedDir = D->getDirection(CurLoopDepth);
+    if (JammedDir == Dependence::DVEntry::GT)
+      return true;
+
+    if (JammedDir & Dependence::DVEntry::LT)
+      return false;
+  }
+
+  // Backward dependencies are only preserved if not interleaved.
+  return Sequentialized;
+}
+
+// Check whether it is semantically safe to execute Src and Dst after
+// unroll-and-jam, considering any potential dependency between them.
+//
+// @param UnrollLevel The level of the loop being unrolled
+// @param JamLevel    The level of the loop being jammed; if Src and Dst are on
+// different levels, the outermost common loop counts as jammed level
+//
+// @return true if it is safe and false if there is a dependency violation.
+static bool checkDependency(Instruction *Src, Instruction *Dst,
+                            unsigned UnrollLevel, unsigned JamLevel,
+                            bool Sequentialized, DependenceInfo &DI) {
+  assert(UnrollLevel <= JamLevel &&
+         "Expecting JamLevel to be at least UnrollLevel");
+
+  if (Src == Dst)
+    return true;
+  // Ignore Input dependencies.
+  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+    return true;
+
+  // Check whether unroll-and-jam may violate a dependency.
+  // By construction, every dependency will be lexicographically non-negative
+  // (if it were negative, it would violate the current execution order), such
+  // as
+  //   (0,0,>,*,*)
+  // Unroll-and-jam changes the GT execution of two executions to the same
+  // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
+  // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
+  //   (0,0,>=,*,*)
+  // Now, the dependency is not necessarily non-negative anymore, i.e.
+  // unroll-and-jam may violate correctness.
+  std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
+  if (!D)
+    return true;
+  assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+  if (D->isConfused()) {
+    LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
+                      << "  " << *Src << "\n"
+                      << "  " << *Dst << "\n");
+    return false;
+  }
+
+  // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
+  // non-equal direction, then the locations accessed in the inner levels cannot
+  // overlap in memory. We assume the indexes never overlap into neighboring
+  // dimensions.
+  for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
+    if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
+      return true;
+
+  auto UnrollDirection = D->getDirection(UnrollLevel);
+
+  // If the distance carried by the unrolled loop is 0, then after unrolling
+  // that distance will become non-zero resulting in non-overlapping accesses in
+  // the inner loops.
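+  // For instance, if a load and a store touch A[i][j] only within the same
+  // i-iteration (EQ at the unroll level), the unrolled copies access rows i,
+  // i+1, ..., i+count-1, which by the assumption above cannot overlap.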
+  if (UnrollDirection == Dependence::DVEntry::EQ)
+    return true;
+
+  if (UnrollDirection & Dependence::DVEntry::LT &&
+      !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
+                                  Sequentialized, D.get()))
+    return false;
+
+  if (UnrollDirection & Dependence::DVEntry::GT &&
+      !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
+                                   Sequentialized, D.get()))
+    return false;
+
+  return true;
+}
+
+static bool
+checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
+                  const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+                  const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
+                  DependenceInfo &DI, LoopInfo &LI) {
+  SmallVector<BasicBlockSet, 8> AllBlocks;
+  for (Loop *L : Root.getLoopsInPreorder())
+    if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
+      AllBlocks.push_back(ForeBlocksMap.lookup(L));
+  AllBlocks.push_back(SubLoopBlocks);
+  for (Loop *L : Root.getLoopsInPreorder())
+    if (AftBlocksMap.find(L) != AftBlocksMap.end())
+      AllBlocks.push_back(AftBlocksMap.lookup(L));
+
+  unsigned LoopDepth = Root.getLoopDepth();
+  SmallVector<Instruction *, 4> EarlierLoadsAndStores;
+  SmallVector<Instruction *, 4> CurrentLoadsAndStores;
+  for (BasicBlockSet &Blocks : AllBlocks) {
+    CurrentLoadsAndStores.clear();
+    if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
+      return false;
+
+    Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
+    unsigned CurLoopDepth = CurLoop->getLoopDepth();
+
+    for (auto *Earlier : EarlierLoadsAndStores) {
+      Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
+      unsigned EarlierDepth = EarlierLoop->getLoopDepth();
+      unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
+      for (auto *Later : CurrentLoadsAndStores) {
+        if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
+                             DI))
+          return false;
+      }
+    }
+
+    size_t NumInsts = CurrentLoadsAndStores.size();
+    for (size_t I = 0; I < NumInsts; ++I) {
+      for (size_t J = I; J < NumInsts; ++J) {
+        if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
+                             LoopDepth, CurLoopDepth, true, DI))
           return false;
-        }
-        if (!InnerLoop) {
-          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
-            LLVM_DEBUG(dbgs() << "  > dependency between:\n"
-                              << "  " << *Src << "\n"
-                              << "  " << *Dst << "\n");
-            return false;
-          }
-        } else {
-          assert(LoopDepth + 1 <= D->getLevels());
-          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
-              D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
-            LLVM_DEBUG(dbgs() << "  < > dependency between:\n"
-                              << "  " << *Src << "\n"
-                              << "  " << *Dst << "\n");
-            return false;
-          }
-        }
       }
     }
+
+    EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
+                                 CurrentLoadsAndStores.end());
   }
   return true;
 }

-static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
-                              BasicBlockSet &SubLoopBlocks,
-                              BasicBlockSet &AftBlocks, DependenceInfo &DI) {
-  // Get all loads/store pairs for each blocks
-  SmallVector<Value *, 4> ForeMemInstr;
-  SmallVector<Value *, 4> SubLoopMemInstr;
-  SmallVector<Value *, 4> AftMemInstr;
-  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
-      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
-      !getLoadsAndStores(AftBlocks, AftMemInstr))
+static bool isEligibleLoopForm(const Loop &Root) {
+  // Root must have a child.
+  if (Root.getSubLoops().size() != 1)
     return false;

-  // Check for dependencies between any blocks that may change order
-  unsigned LoopDepth = L->getLoopDepth();
-  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
-                           DI) &&
-         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
-         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
-                           DI) &&
-         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
-                           DI);
+  const Loop *L = &Root;
+  do {
+    // All loops in Root need to be in simplify and rotated form.
+    if (!L->isLoopSimplifyForm())
+      return false;
+
+    if (!L->isRotatedForm())
+      return false;
+
+    if (L->getHeader()->hasAddressTaken()) {
+      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
+      return false;
+    }
+
+    unsigned SubLoopsSize = L->getSubLoops().size();
+    if (SubLoopsSize == 0)
+      return true;
+
+    // Only one child is allowed.
+    if (SubLoopsSize != 1)
+      return false;
+
+    L = L->getSubLoops()[0];
+  } while (L);
+
+  return true;
+}
+
+static Loop *getInnerMostLoop(Loop *L) {
+  while (!L->getSubLoops().empty())
+    L = L->getSubLoops()[0];
+  return L;
 }

 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
-                                DependenceInfo &DI) {
+                                DependenceInfo &DI, LoopInfo &LI) {
+  if (!isEligibleLoopForm(*L)) {
+    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
+    return false;
+  }
+
   /* We currently handle outer loops like this:
         |
-    ForeFirst    <----\          }
-    Blocks            |          } ForeBlocks
-    ForeLast          |          }
-        |             |
-    SubLoopFirst  <\  |          }
-    Blocks         |  |          } SubLoopBlocks
-    SubLoopLast   -/  |          }
-        |             |
-    AftFirst          |          }
-    Blocks            |          } AftBlocks
-    AftLast     ------/          }
+    ForeFirst    <------\        }
+    Blocks              |        } ForeBlocks of L
+    ForeLast            |        }
+        |               |
+       ...              |
+        |               |
+    ForeFirst    <----\ |        }
+    Blocks            | |        } ForeBlocks of an inner loop of L
+    ForeLast          | |        }
+        |             | |
+    JamLoopFirst  <\  | |        }
+    Blocks         |  | |        } JamLoopBlocks of the innermost loop
+    JamLoopLast   -/  | |        }
+        |             | |
+    AftFirst          | |        }
+    Blocks            | |        } AftBlocks of an inner loop of L
+    AftLast     ------/ |        }
+        |               |
+       ...              |
+        |               |
+    AftFirst            |        }
+    Blocks              |        } AftBlocks of L
+    AftLast     --------/        }
         |

    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
@@ -740,14 +906,16 @@
   things further in the profitablility checks of the unroll and jam pass.

   Because of the way we rearrange basic blocks, we also require that
-  the Fore blocks on all unrolled iterations are safe to move before the
-  SubLoop blocks of all iterations. So we require that the phi node looping
-  operands of ForeHeader can be moved to at least the end of ForeEnd, so that
-  we can arrange cloned Fore Blocks before the subloop and match up Phi's
-  correctly.
+  the Fore blocks of L on all unrolled iterations are safe to move before the
+  blocks of the direct child of L of all iterations. So we require that the
+  phi node looping operands of ForeHeader can be moved to at least the end of
+  ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
+  match up Phi's correctly.

-  i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
-  It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+  i.e. The old order of blocks used to be
+  (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
+  It needs to be safe to transform this to
+  (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.

   There are then a number of checks along the lines of no calls, no
   exceptions, inner loop IV is consistent, etc. Note that for loops requiring
@@ -755,35 +923,13 @@
      UnrollAndJamLoop if the trip count cannot be easily calculated.
   */

-  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
-    return false;
-  Loop *SubLoop = L->getSubLoops()[0];
-  if (!SubLoop->isLoopSimplifyForm())
-    return false;
-
-  BasicBlock *Header = L->getHeader();
-  BasicBlock *Latch = L->getLoopLatch();
-  BasicBlock *Exit = L->getExitingBlock();
-  BasicBlock *SubLoopHeader = SubLoop->getHeader();
-  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
-  if (Latch != Exit)
-    return false;
-  if (SubLoopLatch != SubLoopExit)
-    return false;
-
-  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
-    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
-    return false;
-  }
-
   // Split blocks into Fore/SubLoop/Aft based on dominators
+  Loop *JamLoop = getInnerMostLoop(L);
   BasicBlockSet SubLoopBlocks;
-  BasicBlockSet ForeBlocks;
-  BasicBlockSet AftBlocks;
-  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
-                                AftBlocks, &DT)) {
+  DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
+  DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
+  if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
+                                AftBlocksMap, DT)) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
     return false;
   }
@@ -791,7 +937,7 @@
   // Aft blocks may need to move instructions to fore blocks, which becomes more
   // difficult if there are multiple (potentially conditionally executed)
   // blocks. For now we just exclude loops with multiple aft blocks.
-  if (AftBlocks.size() != 1) {
+  if (AftBlocksMap[L].size() != 1) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
                          "multiple blocks after the loop\n");
     return false;
@@ -799,7 +945,9 @@

   // Check inner loop backedge count is consistent on all iterations of the
   // outer loop
-  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
+  if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
+        return !hasIterationCountInvariantInParent(SubLoop, SE);
+      })) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
                          "not consistent on each iteration\n");
     return false;
@@ -820,6 +968,10 @@
   // ForeBlock phi operands before the subloop

   // Make sure we can move all instructions we need to before the subloop
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  BasicBlockSet AftBlocks = AftBlocksMap[L];
+  Loop *SubLoop = L->getSubLoops()[0];
   if (!processHeaderPhiOperands(
           Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
             if (SubLoop->contains(I->getParent()))
@@ -845,7 +997,8 @@
   // Check for memory dependencies which prohibit the unrolling we are doing.
   // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
   // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
-  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
+  if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
+                         LI)) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
     return false;
   }
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
--- a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
+++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll
@@ -398,6 +398,8 @@
 ; CHECK-LABEL: sub_sub_eq
 ; CHECK: %j = phi
 ; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
 define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
 entry:
   %cmp = icmp sgt i32 %N, 0
diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies_multidims.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies_multidims.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies_multidims.ll
@@ -0,0 +1,219 @@
+; RUN: opt -da-disable-delinearization-checks -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
+; RUN: opt -da-disable-delinearization-checks -aa-pipeline=basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: sub_sub_less
+; CHECK: %j = phi
+; CHECK-NOT: %j.1 = phi
+define void @sub_sub_less([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, 1
+  %add73 = add nuw nsw i32 %j, -1
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_sub_eq
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @sub_sub_eq([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, 1
+  %add73 = add nuw nsw i32 %j, 0
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+
+; CHECK-LABEL: sub_sub_more
+; CHECK: %j = phi
+; CHECK: %j.1 = phi
+; CHECK: %j.2 = phi
+; CHECK: %j.3 = phi
+define void @sub_sub_more([100 x i32]* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
+entry:
+  %cmp = icmp sgt i32 %N, 0
+  br i1 %cmp, label %for.outer, label %cleanup
+
+for.outer:
+  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
+  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx5, align 4
+  %mul = mul nsw i32 %0, %i
+  %add = add nsw i32 %mul, %sum
+  %add6 = add nuw nsw i32 %j, 1
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %i, i32 %j
+  store i32 1, i32* %arrayidx, align 4
+  %add72 = add nuw nsw i32 %i, 1
+  %add73 = add nuw nsw i32 %j, 1
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i32 %add72, i32 %add73
+  store i32 %add, i32* %arrayidx8, align 4
+  %exitcond = icmp eq i32 %add6, %N
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add7 = add nuw nsw i32 %i, 1
+  %exitcond29 = icmp eq i32 %add7, %N
+  br i1 %exitcond29, label %cleanup, label %for.outer
+
+cleanup:
+  ret void
+}
+
+; CHECK-LABEL: sub_sub_less_3d
+; CHECK: %k = phi
+; CHECK-NOT: %k.1 = phi
+
+; for (long i = 0; i < 100; ++i)
+;   for (long j = 0; j < 100; ++j)
+;     for (long k = 0; k < 100; ++k) {
+;       A[i][j][k] = 0;
+;       A[i+1][j][k-1] = 0;
+;     }
+
+define void @sub_sub_less_3d([100 x [100 x i32]]* noalias %A) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i32 [ 0, %entry ], [ %inc.i, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i ], [ %inc.j, %for.j.latch ]
+  br label %for.k
+
+for.k:
+  %k = phi i32 [ 0, %for.j ], [ %inc.k, %for.k ]
+  %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %A, i32 %i, i32 %j, i32 %k
+  store i32 0, i32* %arrayidx, align 4
+  %add.i = add nsw i32 %i, 1
+  %sub.k = add nsw i32 %k, -1
+  %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %A, i32 %add.i, i32 %j, i32 %sub.k
+  store i32 0, i32* %arrayidx2, align 4
+  %inc.k = add nsw i32 %k, 1
+  %cmp.k = icmp slt i32 %inc.k, 100
+  br i1 %cmp.k, label %for.k, label %for.j.latch
+
+for.j.latch:
+  %inc.j = add nsw i32 %j, 1
+  %cmp.j = icmp slt i32 %inc.j, 100
+  br i1 %cmp.j, label %for.j, label %for.i.latch, !llvm.loop !1
+
+for.i.latch:
+  %inc.i = add nsw i32 %i, 1
+  %cmp.i = icmp slt i32 %inc.i, 100
+  br i1 %cmp.i, label %for.i, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: sub_sub_outer_scalar
+; CHECK: %k = phi
+; CHECK-NOT: %k.1 = phi
+
+define void @sub_sub_outer_scalar([100 x i32]* %A) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i64 [ 0, %entry ], [ %inc.i, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i64 [ 0, %for.i ], [ %inc.j, %for.j.latch ]
+  br label %for.k
+
+for.k:
+  %k = phi i64 [ 0, %for.j ], [ %inc.k, %for.k ]
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 %j
+  %arrayidx7 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i64 0, i64 %k
+  %0 = load i32, i32* %arrayidx7, align 4
+  %sub.j = sub nsw i64 %j, 1
+  %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 %sub.j
+  %arrayidx9 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx8, i64 0, i64 %k
+  store i32 %0, i32* %arrayidx9, align 4
+  %inc.k = add nsw i64 %k, 1
+  %cmp.k = icmp slt i64 %inc.k, 100
+  br i1 %cmp.k, label %for.k, label %for.j.latch
+
+for.j.latch:
+  %inc.j = add nsw i64 %j, 1
+  %cmp.j = icmp slt i64 %inc.j, 100
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %inc.i = add nsw i64 %i, 1
+  %cmp.i = icmp slt i64 %inc.i, 100
+  br i1 %cmp.i, label %for.i, label %for.end
+
+for.end:
+  ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.unroll_and_jam.disable"}
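+
+; A rough summary of the direction vectors exercised above (the outer loop is
+; unrolled, the innermost loop is jammed):
+;   sub_sub_less:         (<,>)   - backward at the jam level: must not be
+;                                   unroll-and-jammed.
+;   sub_sub_eq:           (<,=)   - same jam iteration: safe.
+;   sub_sub_more:         (<,<)   - forward at the jam level: safe.
+;   sub_sub_less_3d:      (<,=,>) - backward at the innermost level: rejected.
+;   sub_sub_outer_scalar: the A[j][k] accesses repeat on every i-iteration,
+;                         giving a (*,<,=) dependence whose backward component
+;                         cannot be preserved: rejected.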