Index: include/llvm/Transforms/Utils/UnrollLoop.h =================================================================== --- include/llvm/Transforms/Utils/UnrollLoop.h +++ include/llvm/Transforms/Utils/UnrollLoop.h @@ -34,10 +34,11 @@ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA); -bool UnrollRuntimeLoopProlog(Loop *L, unsigned Count, - bool AllowExpensiveTripCount, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - bool PreserveLCSSA); +bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, + bool AllowExpensiveTripCount, + bool UseEpilogRemainder, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, + bool PreserveLCSSA); MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name); } Index: lib/Transforms/Utils/LoopUnroll.cpp =================================================================== --- lib/Transforms/Utils/LoopUnroll.cpp +++ lib/Transforms/Utils/LoopUnroll.cpp @@ -44,6 +44,11 @@ STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled"); STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)"); +static cl::opt +UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(true), cl::Hidden, + cl::desc("Allow runtime unrolled loops to be unrolled " + "with epilog instead of prolog.")); + /// Convert the instruction operands from referencing the current values into /// those specified by VMap. static inline void remapInstruction(Instruction *I, @@ -288,12 +293,13 @@ "convergent " "operation."); }); - // Don't output the runtime loop prolog if Count is a multiple of + // Don't output the runtime loop remainder if Count is a multiple of // TripMultiple. Such a prolog is never needed, and is unsafe if the loop // contains a convergent instruction. 
if (RuntimeTripCount && TripMultiple % Count != 0 && - !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT, - PreserveLCSSA)) + !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount, + UnrollRuntimeEpilog, LI, SE, DT, + PreserveLCSSA)) return false; // Notify ScalarEvolution that the loop will be substantially changed, Index: lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -16,8 +16,8 @@ // case, we need to generate code to execute these 'left over' iterations. // // The current strategy generates an if-then-else sequence prior to the -// unrolled loop to execute the 'left over' iterations. Other strategies -// include generate a loop before or after the unrolled loop. +// unrolled loop to execute the 'left over' iterations before or after the +// unrolled loop. // //===----------------------------------------------------------------------===// @@ -60,33 +60,33 @@ /// than the unroll factor. /// static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, - BasicBlock *LastPrologBB, BasicBlock *PrologEnd, - BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &VMap, DominatorTree *DT, - LoopInfo *LI, bool PreserveLCSSA) { + BasicBlock *PrologExit, BasicBlock *PreHeader, + BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, + DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); + BasicBlock *PrologLatch = cast(VMap[Latch]); // Create a PHI node for each outgoing value from the original loop // (which means it is an outgoing value from the prolog code too). // The new PHI node is inserted in the prolog end basic block. 
- // The new PHI name is added as an operand of a PHI node in either + // The new PHI node value is added as an operand of a PHI node in either // the loop header or the loop exit block. - for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch); - SBI != SBE; ++SBI) { - for (BasicBlock::iterator BBI = (*SBI)->begin(); + for (BasicBlock *Succ : successors(Latch)) { + for (BasicBlock::iterator BBI = Succ->begin(); PHINode *PN = dyn_cast(BBI); ++BBI) { // Add a new PHI node to the prolog end block and add the // appropriate incoming values. - PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr", - PrologEnd->getTerminator()); + PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr", + PrologExit->getFirstNonPHI()); // Adding a value to the new PHI node from the original loop preheader. // This is the value that skips all the prolog code. if (L->contains(PN)) { - NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH); + NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), + PreHeader); } else { - NewPN->addIncoming(UndefValue::get(PN->getType()), OrigPH); + NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader); } Value *V = PN->getIncomingValueForBlock(Latch); @@ -97,22 +97,22 @@ } // Adding a value to the new PHI node from the last prolog block // that was created. - NewPN->addIncoming(V, LastPrologBB); + NewPN->addIncoming(V, PrologLatch); // Update the existing PHI node operand with the value from the // new PHI node. How this is done depends on if the existing // PHI node is in the original loop block, or the exit block. 
if (L->contains(PN)) { - PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN); + PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN); } else { - PN->addIncoming(NewPN, PrologEnd); + PN->addIncoming(NewPN, PrologExit); } } } // Create a branch around the original loop, which is taken if there are no // iterations remaining to be executed after running the prologue. - Instruction *InsertPt = PrologEnd->getTerminator(); + Instruction *InsertPt = PrologExit->getTerminator(); IRBuilder<> B(InsertPt); assert(Count != 0 && "nonsensical Count!"); @@ -126,25 +126,142 @@ BasicBlock *Exit = L->getUniqueExitBlock(); assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees - SmallVector Preds(pred_begin(Exit), pred_end(Exit)); + SmallVector Preds(predecessors(Exit)); SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI, PreserveLCSSA); // Add the branch to the exit block (around the unrolled loop) - B.CreateCondBr(BrLoopExit, Exit, NewPH); + B.CreateCondBr(BrLoopExit, Exit, NewPreHeader); + InsertPt->eraseFromParent(); +} + +/// Connect the unrolling epilog code to the original loop. +/// The unrolling epilog code contains code to execute the +/// 'extra' iterations if the run-time trip count modulo the +/// unroll count is non-zero. +/// +/// This function performs the following: +/// - Update PHI nodes at the unrolling loop exit and epilog loop exit +/// - Create PHI nodes at the unrolling loop exit to combine +/// values that exit the unrolling loop code and jump around it. +/// - Update PHI operands in the epilog loop by the new PHI nodes +/// - Branch around the epilog loop if extra iters (ModVal) is zero. 
+/// +static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, + BasicBlock *Exit, BasicBlock *PreHeader, + BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, + ValueToValueMapTy &VMap, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "Loop must have a latch"); + BasicBlock *EpilogLatch = cast(VMap[Latch]); + + // Loop structure should be the following: + // + // PreHeader + // NewPreHeader + // Header + // ... + // Latch + // NewExit (PN) + // EpilogPreHeader + // EpilogHeader + // ... + // EpilogLatch + // Exit (EpilogPN) + + // Update PHI nodes at NewExit and Exit. + for (BasicBlock::iterator BBI = NewExit->begin(); + PHINode *PN = dyn_cast(BBI); ++BBI) { + // PN should be used in another PHI located in Exit block as + // Exit was split by SplitBlockPredecessors into Exit and NewExit. + // Basically, it should look like: + // NewExit: + // PN = PHI [I, Latch] + // ... + // Exit: + // EpilogPN = PHI [PN, EpilogPreHeader] + // + // There is EpilogPreHeader incoming block instead of NewExit as + // NewExit was split one more time to get EpilogPreHeader. + assert(PN->hasOneUse() && "The phi should have 1 use"); + PHINode *EpilogPN = cast (PN->use_begin()->getUser()); + assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block"); + + // Add incoming PreHeader from branch around the Loop + PN->addIncoming(UndefValue::get(PN->getType()), PreHeader); + + Value *V = PN->getIncomingValueForBlock(Latch); + Instruction *I = dyn_cast(V); + if (I && L->contains(I)) + // If value comes from an instruction in the loop add VMap value. + V = VMap[I]; + // For the instruction out of the loop, constant or undefined value + // insert value itself. + EpilogPN->addIncoming(V, EpilogLatch); + + assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 && + "EpilogPN should have EpilogPreHeader incoming block"); + // Change EpilogPreHeader incoming block to NewExit.
+ EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader), + NewExit); + // Now PHIs should look like: + // NewExit: + // PN = PHI [I, Latch], [undef, PreHeader] + // ... + // Exit: + // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch] + } + + // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader). + // Update corresponding PHI nodes in epilog loop. + for (BasicBlock *Succ : successors(Latch)) { + // Skip this as we already updated phis in exit blocks. + if (!L->contains(Succ)) + continue; + for (BasicBlock::iterator BBI = Succ->begin(); + PHINode *PN = dyn_cast(BBI); ++BBI) { + // Add new PHI nodes to the loop exit block and update epilog + // PHIs with the new PHI values. + PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr", + NewExit->getFirstNonPHI()); + // Adding a value to the new PHI node from the unrolling loop preheader. + NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader); + // Adding a value to the new PHI node from the unrolling loop latch. + NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch); + + // Update the existing PHI node operand with the value from the new PHI + // node. Corresponding instruction in epilog loop should be PHI. + PHINode *VPN = cast(VMap[&*BBI]); + VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN); + } + } + + Instruction *InsertPt = NewExit->getTerminator(); + IRBuilder<> B(InsertPt); + Value *BrLoopExit = B.CreateIsNotNull(ModVal); + assert(Exit && "Loop must have a single exit block only"); + // Split the exit to maintain loop canonicalization guarantees + SmallVector Preds(predecessors(Exit)); + SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, + PreserveLCSSA); + // Add the branch to the exit block (around the unrolling loop) + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit); InsertPt->eraseFromParent(); } /// Create a clone of the blocks in a loop and connect them together. 
-/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new -/// loop will be created including all cloned blocks, and the iterator of it +/// If CreateLoop is false, loop structure will not be cloned, otherwise a +/// new loop will be created including all cloned blocks, and the iterator of it /// switches to count NewIter down to 0. /// -static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, +static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateLoop, + const bool UseEpilogRemainder, BasicBlock *InsertTop, BasicBlock *InsertBot, + BasicBlock *Preheader, std::vector &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, LoopInfo *LI) { - BasicBlock *Preheader = L->getLoopPreheader(); + StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); Function *F = Header->getParent(); @@ -152,7 +269,7 @@ LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); Loop *NewLoop = nullptr; Loop *ParentLoop = L->getParentLoop(); - if (!UnrollProlog) { + if (CreateLoop) { NewLoop = new Loop(); if (ParentLoop) ParentLoop->addChildLoop(NewLoop); @@ -163,7 +280,7 @@ // For each block in the original loop, create a new copy, // and update the value map with the newly created values. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { - BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F); + BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F); NewBlocks.push_back(NewBB); if (NewLoop) @@ -179,16 +296,17 @@ } if (Latch == *BB) { - // For the last block, if UnrollProlog is true, create a direct jump to + // For the last block, if CreateLoop is false, create a direct jump to // InsertBot. If not, create a loop back to cloned head. 
VMap.erase((*BB)->getTerminator()); BasicBlock *FirstLoopBB = cast(VMap[Header]); BranchInst *LatchBR = cast(NewBB->getTerminator()); IRBuilder<> Builder(LatchBR); - if (UnrollProlog) { + if (!CreateLoop) { Builder.CreateBr(InsertBot); } else { - PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter", + PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, + suffix + ".iter", FirstLoopBB->getFirstNonPHI()); Value *IdxSub = Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), @@ -207,9 +325,15 @@ // cloned loop. for (BasicBlock::iterator I = Header->begin(); isa(I); ++I) { PHINode *NewPHI = cast(VMap[&*I]); - if (UnrollProlog) { - VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); - cast(VMap[Header])->getInstList().erase(NewPHI); + if (!CreateLoop) { + if (UseEpilogRemainder) { + unsigned idx = NewPHI->getBasicBlockIndex(Preheader); + NewPHI->setIncomingBlock(idx, InsertTop); + NewPHI->removeIncomingValue(Latch, false); + } else { + VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); + cast(VMap[Header])->getInstList().erase(NewPHI); + } } else { unsigned idx = NewPHI->getBasicBlockIndex(Preheader); NewPHI->setIncomingBlock(idx, InsertTop); @@ -254,7 +378,7 @@ } } -/// Insert code in the prolog code when unrolling a loop with a +/// Insert code in the prolog/epilog code when unrolling a loop with a /// run-time trip-count. /// /// This method assumes that the loop unroll factor is total number @@ -266,6 +390,7 @@ /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// +/// ***Prolog case*** /// extraiters = tripcount % loopfactor /// if (extraiters == 0) jump Loop: /// else jump Prol @@ -277,17 +402,35 @@ /// ... /// End: /// -bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, - bool AllowExpensiveTripCount, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - bool PreserveLCSSA) { - // For now, only unroll loops that contain a single exit. 
+/// ***Epilog case*** +/// unroll_iter = tripcount - tripcount % loopfactor +/// if (unroll_iter == 0) jump LoopExit: +/// Loop: LoopBody; (executes unroll_iter times); +/// unroll_iter -= 1; +/// if (unroll_iter != 0) jump Loop: +/// LoopExit: +/// extraiters = tripcount % loopfactor +/// if (extraiters == 0) jump EpilExit: +/// Epil: LoopBody; (executes extraiters times) +/// extraiters -= 1 // Omitted if unroll factor is 2. +/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2. +/// EpilExit: + +bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, + bool AllowExpensiveTripCount, + bool UseEpilogRemainder, + LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, bool PreserveLCSSA) { + // For now, only unroll loops that contain a single exit. if (!L->getExitingBlock()) return false; // Make sure the loop is in canonical form, and there is a single // exit block only. - if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) + if (!L->isLoopSimplifyForm()) + return false; + BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop + if (!Exit) return false; // Use Scalar Evolution to compute the trip count. This allows more loops to @@ -311,8 +454,8 @@ return false; BasicBlock *Header = L->getHeader(); - BasicBlock *PH = L->getLoopPreheader(); - BranchInst *PreHeaderBR = cast(PH->getTerminator()); + BasicBlock *PreHeader = L->getLoopPreheader(); + BranchInst *PreHeaderBR = cast(PreHeader->getTerminator()); const DataLayout &DL = Header->getModule()->getDataLayout(); SCEVExpander Expander(*SE, DL, "loop-unroll"); if (!AllowExpensiveTripCount && @@ -335,24 +478,77 @@ SE->forgetLoop(ParentLoop); BasicBlock *Latch = L->getLoopLatch(); - // It helps to split the original preheader twice, one for the end of the - // prolog code and one for a new loop preheader.
- BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI); - BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI); - PreHeaderBR = cast(PH->getTerminator()); + // Loop structure is the following: + // + // PreHeader + // Header + // ... + // Latch + // Exit + + BasicBlock *NewPreHeader; + BasicBlock *NewExit = nullptr; + BasicBlock *PrologExit = nullptr; + BasicBlock *EpilogPreHeader = nullptr; + + if (UseEpilogRemainder) { + // If epilog remainder + // Split PreHeader to insert a branch around loop for unrolling. + NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI); + NewPreHeader->setName(PreHeader->getName() + ".new"); + + // Split Exit to create phi nodes from branch above. + SmallVector Preds(predecessors(Exit)); + NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", + DT, LI, PreserveLCSSA); + // Split NewExit to insert epilog remainder loop. + EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI); + EpilogPreHeader->setName(Header->getName() + ".epil.preheader"); + // Loop structure should be the following: + // + // PreHeader + // *NewPreHeader + // Header + // ... + // Latch + // *NewExit + // *EpilogPreHeader + // Exit + } else { + // If prolog remainder + // Split the original preheader twice, one for the end of the + // prolog code and one for a new loop preheader. + PrologExit = SplitEdge(PreHeader, Header, DT, LI); + PrologExit->setName(Header->getName() + ".prol.loopexit"); + NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI); + NewPreHeader->setName(PreHeader->getName() + ".new"); + // Loop structure should be the following: + // PreHeader + // *PrologExit + // *NewPreHeader + // Header + // ... + // Latch + // Exit + } + + // Calculate conditions for branch around loop for unrolling + // in epilog case and around prolog remainder loop in prolog case. 
// Compute the number of extra iterations required, which is: - // extra iterations = run-time trip count % (loop unroll factor + 1) + // extra iterations = run-time trip count % loop unroll factor + PreHeaderBR = cast(PreHeader->getTerminator()); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); - Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), + Value *BECount = UseEpilogRemainder ? nullptr : + Expander.expandCodeFor(BECountSC, BECountSC->getType(), PreHeaderBR); IRBuilder<> B(PreHeaderBR); Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); // If ModVal is zero, we know that either - // 1. There are no iterations to be run in the prologue loop. + // 1. There are no iterations to be run in the prolog/epilog loop. // OR // 2. The addition computing TripCount overflowed. // @@ -361,18 +557,26 @@ // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we // explicitly check this above). - Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod"); - - // Branch to either the extra iterations or the cloned/unrolled loop. - // We will fix up the true branch label when adding loop body copies. 
- B.CreateCondBr(BranchVal, PEnd, PEnd); - assert(PreHeaderBR->isUnconditional() && - PreHeaderBR->getSuccessor(0) == PEnd && - "CFG edges in Preheader are not correct"); + Value *TestVal = ModVal; + if (UseEpilogRemainder) + TestVal = B.CreateSub(TripCount, ModVal, "unroll_iter"); + Value *BranchVal = B.CreateIsNotNull(TestVal, "lcmp.mod"); + + if (UseEpilogRemainder) { + // Branch to the unrolled loop, or around it if unroll_iter is zero. + B.CreateCondBr(BranchVal, NewPreHeader, NewExit); + } else { + // Branch to either the extra iterations or the cloned/unrolling loop. + // We will fix up the true branch label when adding loop body copies. + B.CreateCondBr(BranchVal, PrologExit, PrologExit); + assert(PreHeaderBR->isUnconditional() && + PreHeaderBR->getSuccessor(0) == PrologExit && + "CFG edges in Preheader are not correct"); + } PreHeaderBR->eraseFromParent(); Function *F = Header->getParent(); // Get an ordered list of blocks in the loop to help with the ordering of the - // cloned blocks in the prolog code. + // cloned blocks in the prolog/epilog code. LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); @@ -384,17 +588,38 @@ std::vector NewBlocks; ValueToValueMapTy VMap; - bool UnrollPrologue = Count == 2; + // For unroll factor 2 the remainder loop will have 1 iteration. + // Do not create a 1-iteration loop. + bool CreateLoop = (Count != 2); // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. - CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks, - VMap, LI); + BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit; + BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PreHeader; + CloneLoopBlocks(L, ModVal, CreateLoop, UseEpilogRemainder, InsertTop, + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI); + + // Insert the cloned blocks into the function.
+ F->getBasicBlockList().splice(InsertBot->getIterator(), + F->getBasicBlockList(), + NewBlocks[0]->getIterator(), + F->end()); - // Insert the cloned blocks into the function just before the original loop. - F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(), - NewBlocks[0]->getIterator(), F->end()); + // Loop structure should be the following: + // Epilog Prolog + // + // PreHeader + // NewPreHeader PreHeader + // Header PrologHeader + // ... ... + // Latch PrologLatch + // NewExit PrologExit + // EpilogPreHeader NewPreHeader + // EpilogHeader Header + // ... ... + // EpilogLatch Latch + // Exit Exit // Rewrite the cloned instruction operands to use the values created when the // clone is created. @@ -405,11 +630,36 @@ } } - // Connect the prolog code to the original loop and update the - // PHI functions. - BasicBlock *LastLoopBB = cast(VMap[Latch]); - ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI, - PreserveLCSSA); + if (UseEpilogRemainder) { + // Connect the epilog code to the original loop and update the + // PHI functions. + ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader, + EpilogPreHeader, NewPreHeader, VMap, DT, LI, + PreserveLCSSA); + + // Update counter in loop for unrolling. + // Its initial value (TestVal) is a multiple of Count. + BranchInst *LatchBR = cast(Latch->getTerminator()); + IRBuilder<> B2(LatchBR); + PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + Header->getFirstNonPHI()); + Value *IdxSub = + B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nsub"); + Value *IdxCmp; + if (LatchBR->getSuccessor(0) == Header) + IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp"); + else + IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); + NewIdx->addIncoming(TestVal, NewPreHeader); + NewIdx->addIncoming(IdxSub, Latch); + LatchBR->setCondition(IdxCmp); + } else { + // Connect the prolog code to the original loop and update the + // PHI functions.
+ ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader, + VMap, DT, LI, PreserveLCSSA); + } NumRuntimeUnrolled++; return true; } Index: test/Transforms/LoopUnroll/AArch64/runtime-loop.ll =================================================================== --- test/Transforms/LoopUnroll/AArch64/runtime-loop.ll +++ test/Transforms/LoopUnroll/AArch64/runtime-loop.ll @@ -1,13 +1,22 @@ -; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG ; Tests for unrolling loops with run-time trip counts -; CHECK: %xtraiter = and i32 %n -; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0 -; CHECK: br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split +; EPILOG: %xtraiter = and i32 %n +; EPILOG: %unroll_iter = sub i32 %n, %xtraiter +; EPILOG: %lcmp.mod = icmp ne i32 %unroll_iter, 0 +; EPILOG: br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa + +; PROLOG: %xtraiter = and i32 %n +; PROLOG: %lcmp.mod = icmp ne i32 %xtraiter, 0 +; PROLOG: br i1 %lcmp.mod, label %for.body.prol, label %for.body.prol.loopexit -; CHECK: for.body.prol: -; CHECK: for.body: +; EPILOG: for.body: +; EPILOG: for.body.epil: + +; PROLOG: for.body.prol: +; PROLOG: for.body: define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { entry: Index: test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll =================================================================== --- test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll +++ test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s +; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s 
-check-prefix=EPILOG +; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG define void @unroll_opt_for_size() nounwind optsize { entry: br label %loop @@ -13,11 +14,17 @@ ret void } -; CHECK-LABEL: @unroll_opt_for_size -; CHECK: add -; CHECK-NEXT: add -; CHECK-NEXT: add -; CHECK: icmp +; EPILOG-LABEL: @unroll_opt_for_size +; EPILOG: add +; EPILOG-NEXT: add +; EPILOG-NEXT: add +; EPILOG: icmp + +; PROLOG-LABEL: @unroll_opt_for_size +; PROLOG: add +; PROLOG-NEXT: add +; PROLOG-NEXT: add +; PROLOG: icmp define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { entry: @@ -40,8 +47,13 @@ ret i32 %sum.0.lcssa } -; CHECK-LABEL: @test -; CHECK: for.body.prol{{.*}}: -; CHECK: for.body: -; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body +; EPILOG-LABEL: @test +; EPILOG: for.body: +; EPILOG: br i1 %niter.ncmp.7, label %for.end.loopexit{{.*}}, label %for.body +; EPILOG: for.body.epil{{.*}}: + +; PROLOG-LABEL: @test +; PROLOG: for.body.prol{{.*}}: +; PROLOG: for.body: +; PROLOG: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body Index: test/Transforms/LoopUnroll/X86/mmx.ll =================================================================== --- test/Transforms/LoopUnroll/X86/mmx.ll +++ test/Transforms/LoopUnroll/X86/mmx.ll @@ -14,9 +14,9 @@ exit: ; preds = %for.body %ret = phi x86_mmx [ undef, %for.body ] - ; CHECK: %[[ret_unr:.*]] = phi x86_mmx [ undef, - ; CHECK: %[[ret_ph:.*]] = phi x86_mmx [ undef, - ; CHECK: %[[ret:.*]] = phi x86_mmx [ %[[ret_unr]], {{.*}} ], [ %[[ret_ph]] + ; CHECK: %[[ret_ph:.*]] = phi x86_mmx [ undef, %entry + ; CHECK: %[[ret_ph1:.*]] = phi x86_mmx [ undef, + ; CHECK: %[[ret:.*]] = phi x86_mmx [ %[[ret_ph]], {{.*}} ], [ %[[ret_ph1]], ; CHECK: ret x86_mmx %[[ret]] ret x86_mmx %ret } Index: test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll 
=================================================================== --- test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll +++ test/Transforms/LoopUnroll/high-cost-trip-count-computation.ll @@ -34,7 +34,7 @@ ; CHECK: udiv ; CHECK: udiv ; CHECK-NOT: udiv -; CHECK-LABEL: for.body.prol +; CHECK-LABEL: for.body entry: %rem0 = load i64, i64* %loc, align 8 %ExpensiveComputation = udiv i64 %rem0, 42 ; <<< Extra computations are added to the trip-count expression Index: test/Transforms/LoopUnroll/runtime-loop.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop.ll +++ test/Transforms/LoopUnroll/runtime-loop.ll @@ -1,18 +1,31 @@ -; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-runtime=true | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Tests for unrolling loops with run-time trip counts -; CHECK: %xtraiter = and i32 %n -; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0 -; CHECK: br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split - -; CHECK: for.body.prol: -; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ] -; CHECK: %prol.iter.sub = sub i32 %prol.iter, 1 -; CHECK: %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0 -; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split, !llvm.loop !0 +; EPILOG: %xtraiter = and i32 %n +; EPILOG: %unroll_iter = sub i32 %n, %xtraiter +; EPILOG: %lcmp.mod = icmp ne i32 %unroll_iter, 0 +; EPILOG: br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa + +; PROLOG: %xtraiter = and i32 %n +; PROLOG: %lcmp.mod = icmp ne 
i32 %xtraiter, 0 +; PROLOG: br i1 %lcmp.mod, label %for.body.prol, label %for.body.prol.loopexit + +; EPILOG: for.body.epil: +; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ], [ %indvars.iv.unr, %for.body.epil.preheader ] +; EPILOG: %epil.iter.sub = sub i32 %epil.iter, 1 +; EPILOG: %epil.iter.cmp = icmp ne i32 %epil.iter.sub, 0 +; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop !0 + +; PROLOG: for.body.prol: +; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ] +; PROLOG: %prol.iter.sub = sub i32 %prol.iter, 1 +; PROLOG: %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0 +; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit, !llvm.loop !0 + define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { entry: @@ -39,8 +52,11 @@ ; Still try to completely unroll loops with compile-time trip counts ; even if the -unroll-runtime is specified -; CHECK: for.body: -; CHECK-NOT: for.body.prol: +; EPILOG: for.body: +; EPILOG-NOT: for.body.epil: + +; PROLOG: for.body: +; PROLOG-NOT: for.body.prol: define i32 @test1(i32* nocapture %a) nounwind uwtable readonly { entry: @@ -64,7 +80,8 @@ ; This is test 2007-05-09-UnknownTripCount.ll which can be unrolled now ; if the -unroll-runtime option is turned on -; CHECK: bb72.2: +; EPILOG: bb72.2: +; PROLOG: bb72.2: define void @foo(i32 %trips) { entry: @@ -86,8 +103,11 @@ ; Test run-time unrolling for a loop that counts down by -2. 
-; CHECK: for.body.prol: -; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split +; EPILOG: for.body.epil: +; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.cond.for.end_crit_edge.epilog-lcssa + +; PROLOG: for.body.prol: +; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly { entry: @@ -116,8 +136,11 @@ } ; Test run-time unrolling disable metadata. -; CHECK: for.body: -; CHECK-NOT: for.body.prol: +; EPILOG: for.body: +; EPILOG-NOT: for.body.epil: + +; PROLOG: for.body: +; PROLOG-NOT: for.body.prol: define zeroext i16 @test2(i16* nocapture %p, i32 %len) nounwind uwtable readonly { entry: @@ -148,6 +171,8 @@ !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: !0 = distinct !{!0, !1} -; CHECK: !1 = !{!"llvm.loop.unroll.disable"} +; EPILOG: !0 = distinct !{!0, !1} +; EPILOG: !1 = !{!"llvm.loop.unroll.disable"} +; PROLOG: !0 = distinct !{!0, !1} +; PROLOG: !1 = !{!"llvm.loop.unroll.disable"} Index: test/Transforms/LoopUnroll/runtime-loop1.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop1.ll +++ test/Transforms/LoopUnroll/runtime-loop1.ll @@ -1,19 +1,35 @@ -; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG ; This tests that setting the unroll count works -; CHECK: for.body.preheader: -; CHECK: br {{.*}} label %for.body.prol, label %for.body.preheader.split, !dbg [[PH_LOC:![0-9]+]] -; CHECK: for.body.prol: -; CHECK: br label %for.body.preheader.split, !dbg [[BODY_LOC:![0-9]+]] -; CHECK: for.body.preheader.split: -; CHECK: br {{.*}} label %for.end.loopexit, label 
%for.body.preheader.split.split, !dbg [[PH_LOC]] -; CHECK: for.body: -; CHECK: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body, !dbg [[BODY_LOC]] -; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body -; CHECK-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}}) -; CHECK-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}}) +; EPILOG: for.body.preheader: +; EPILOG: br i1 %lcmp.mod, label %for.body.preheader.new, label %for.end.loopexit.unr-lcssa, !dbg [[PH_LOC:![0-9]+]] +; EPILOG: for.body: +; EPILOG: br i1 %niter.ncmp.1, label %for.end.loopexit.unr-lcssa.loopexit, label %for.body, !dbg [[BODY_LOC:![0-9]+]] +; EPILOG-NOT: br i1 %niter.ncmp.2, label %for.end.loopexit{{.*}}, label %for.body +; EPILOG: for.body.epil.preheader: +; EPILOG: br label %for.body.epil, !dbg [[EXIT_LOC:![0-9]+]] +; EPILOG: for.body.epil: +; EPILOG: br label %for.end.loopexit.epilog-lcssa, !dbg [[BODY_LOC:![0-9]+]] + +; EPILOG-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}}) +; EPILOG-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}}) +; EPILOG-DAG: [[EXIT_LOC]] = !DILocation(line: 103, column: 1, scope: !{{.*}}) + +; PROLOG: for.body.preheader: +; PROLOG: br {{.*}} label %for.body.prol, label %for.body.prol.loopexit, !dbg [[PH_LOC:![0-9]+]] +; PROLOG: for.body.prol: +; PROLOG: br label %for.body.prol.loopexit, !dbg [[BODY_LOC:![0-9]+]] +; PROLOG: for.body.prol.loopexit: +; PROLOG: br {{.*}} label %for.end.loopexit, label %for.body.preheader.new, !dbg [[PH_LOC]] +; PROLOG: for.body: +; PROLOG: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body, !dbg [[BODY_LOC]] +; PROLOG-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body + +; PROLOG-DAG: [[PH_LOC]] = !DILocation(line: 101, column: 1, scope: !{{.*}}) +; PROLOG-DAG: [[BODY_LOC]] = !DILocation(line: 102, column: 1, scope: !{{.*}}) define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable 
readonly !dbg !6 { entry: Index: test/Transforms/LoopUnroll/runtime-loop2.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop2.ll +++ test/Transforms/LoopUnroll/runtime-loop2.ll @@ -1,12 +1,18 @@ -; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-count=8 | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -loop-unroll -unroll-threshold=25 -unroll-runtime -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG ; Choose a smaller, power-of-two, unroll count if the loop is too large. ; This test makes sure we're not unrolling 'odd' counts -; CHECK: for.body.prol: -; CHECK: for.body: -; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body -; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body +; EPILOG: for.body: +; EPILOG: br i1 %niter.ncmp.3, label %for.end.loopexit.unr-lcssa.loopexit{{.*}}, label %for.body +; EPILOG-NOT: br i1 %niter.ncmp.4, label %for.end.loopexit.unr-lcssa.loopexit{{.*}}, label %for.body +; EPILOG: for.body.epil: + +; PROLOG: for.body.prol: +; PROLOG: for.body: +; PROLOG: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body +; PROLOG-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { entry: Index: test/Transforms/LoopUnroll/runtime-loop4.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop4.ll +++ test/Transforms/LoopUnroll/runtime-loop4.ll @@ -1,13 +1,22 @@ -; RUN: opt < %s -S -O2 -unroll-runtime=true | FileCheck %s +; RUN: opt < %s -S -O2 -unroll-runtime=true | FileCheck %s -check-prefix=EPILOG +; RUN: opt < %s -S -O2 -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG ; Check runtime unrolling prologue 
can be promoted by LICM pass. -; CHECK: entry: -; CHECK: %xtraiter -; CHECK: %lcmp.mod -; CHECK: loop1: -; CHECK: br i1 %lcmp.mod -; CHECK: loop2.prol: +; EPILOG: entry: +; EPILOG: %xtraiter +; EPILOG: %lcmp.mod +; EPILOG: %unroll_iter +; EPILOG: loop1: +; EPILOG: br i1 %lcmp.mod +; EPILOG: loop2.epil: + +; PROLOG: entry: +; PROLOG: %xtraiter +; PROLOG: %lcmp.mod +; PROLOG: loop1: +; PROLOG: br i1 %lcmp.mod +; PROLOG: loop2.prol: define void @unroll(i32 %iter, i32* %addr1, i32* %addr2) nounwind { entry: Index: test/Transforms/LoopUnroll/runtime-loop5.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop5.ll +++ test/Transforms/LoopUnroll/runtime-loop5.ll @@ -11,9 +11,6 @@ %cmp1 = icmp eq i3 %n, 0 br i1 %cmp1, label %for.end, label %for.body -; UNROLL-16-NOT: for.body.prol: -; UNROLL-4: for.body.prol: - for.body: ; preds = %for.body, %entry ; UNROLL-16-LABEL: for.body: ; UNROLL-4-LABEL: for.body: @@ -39,6 +36,10 @@ ; UNROLL-16-LABEL: for.end ; UNROLL-4-LABEL: for.end + +; UNROLL-16-NOT: for.body.epil: +; UNROLL-4: for.body.epil: + for.end: ; preds = %for.body, %entry %sum.0.lcssa = phi i3 [ 0, %entry ], [ %add, %for.body ] ret i3 %sum.0.lcssa Index: test/Transforms/LoopUnroll/tripcount-overflow.ll =================================================================== --- test/Transforms/LoopUnroll/tripcount-overflow.ll +++ test/Transforms/LoopUnroll/tripcount-overflow.ll @@ -13,13 +13,14 @@ ; CHECK: entry: ; CHECK-NEXT: %0 = add i32 %N, 1 ; CHECK-NEXT: %xtraiter = and i32 %0, 1 -; CHECK-NEXT: %lcmp.mod = icmp ne i32 %xtraiter, 0 -; CHECK-NEXT: br i1 %lcmp.mod, label %while.body.prol, label %entry.split +; CHECK-NEXT: %unroll_iter = sub i32 %0, %xtraiter +; CHECK-NEXT: %lcmp.mod = icmp ne i32 %unroll_iter, 0 +; CHECK-NEXT: br i1 %lcmp.mod, label %entry.new, label %while.end.unr-lcssa -; CHECK: while.body.prol: -; CHECK: br label %entry.split +; CHECK: while.body.epil: +; CHECK: br label %while.end.epilog-lcssa 
-; CHECK: entry.split: +; CHECK: while.end.epilog-lcssa: ; Function Attrs: nounwind readnone ssp uwtable define i32 @foo(i32 %N) { Index: test/Transforms/LoopUnroll/unroll-cleanup.ll =================================================================== --- test/Transforms/LoopUnroll/unroll-cleanup.ll +++ test/Transforms/LoopUnroll/unroll-cleanup.ll @@ -4,14 +4,14 @@ ; RUN: opt < %s -O2 -S | FileCheck %s ; After loop unroll: -; %dec18 = add nsw i32 %dec18.in, -1 +; %niter.nsub = add nsw i32 %niter, -1 ; ... -; %dec18.1 = add nsw i32 %dec18, -1 +; %niter.nsub.1 = add nsw i32 %niter.nsub, -1 ; should be merged to: -; %dec18.1 = add nsw i32 %dec18.in, -2 +; %niter.nsub.1 = add i32 %niter, -2 ; ; CHECK-LABEL: @_Z3fn1v( -; CHECK: %dec18.1 = add nsw i32 %dec18.in, -2 +; CHECK: %niter.nsub.1 = add i32 %niter, -2 ; ModuleID = '' target triple = "x86_64-unknown-linux-gnu" Index: test/Transforms/LoopUnroll/unroll-pragmas.ll =================================================================== --- test/Transforms/LoopUnroll/unroll-pragmas.ll +++ test/Transforms/LoopUnroll/unroll-pragmas.ll @@ -171,10 +171,6 @@ ; should be duplicated (original and 4x unrolled). ; ; CHECK-LABEL: @runtime_loop_with_count4( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body ; CHECK: store ; CHECK: store @@ -182,6 +178,10 @@ ; CHECK: store ; CHECK-NOT: store ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -287,10 +287,6 @@ ; (original and 8x).
; ; CHECK-LABEL: @runtime_loop_with_enable( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body: ; CHECK: store i32 ; CHECK: store i32 @@ -302,6 +298,10 @@ ; CHECK: store i32 ; CHECK-NOT: store i32 ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0