Index: lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -45,6 +45,10 @@ STATISTIC(NumRuntimeUnrolled, "Number of loops unrolled with run-time trip counts"); +static cl::opt UnrollRuntimeMultiExit( + "unroll-runtime-multi-exit", cl::init(false), cl::Hidden, + cl::desc("Allow runtime unrolling for loops with multiple exits, when " + "epilog is generated")); /// Connect the unrolling prolog code to the original loop. /// The unrolling prolog code contains code to execute the @@ -286,14 +290,13 @@ /// If loop structure is cloned InsertTop should be new preheader, InsertBot /// new loop exit. /// -static void CloneLoopBlocks(Loop *L, Value *NewIter, - const bool CreateRemainderLoop, - const bool UseEpilogRemainder, - BasicBlock *InsertTop, BasicBlock *InsertBot, - BasicBlock *Preheader, - std::vector &NewBlocks, - LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI) { +static void +CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, + const bool UseEpilogRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, + std::vector &NewBlocks, LoopBlocksDFS &LoopBlocks, + ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI, + SmallVectorImpl &OtherExits) { StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -419,6 +422,43 @@ NewLoopID->replaceOperandWith(0, NewLoopID); NewLoop->setLoopID(NewLoopID); } + + // At this stage, the loop blocks are cloned for the prolog/epilog. Clone the + // other exit blocks as well for the epilog, and update the Vmap and dominator + // tree info. + for (auto *BB : OtherExits) { + + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); + NewBB->setName(BB->getName() + ".epil"); + for (auto &II : *BB) { + Instruction *NewInst = II.clone(); + // For phi nodes, get the appropriate values and the block from the VMap. + if (isa(II)) { + PHINode *OldPHI = cast(&II); + PHINode *NewPHI = cast(NewInst); + for (unsigned int i = 0; i < OldPHI->getNumIncomingValues(); i++) { + BasicBlock *NewPred = + cast(VMap[OldPHI->getIncomingBlock(i)]); + Value *InVal = OldPHI->getIncomingValue(i); + NewPHI->setIncomingBlock(i, NewPred); + if (Value *V = VMap.lookup(InVal)) + NewPHI->setIncomingValue(i, V); + } + NewInst = NewPHI; + } + if (II.hasName()) + NewInst->setName(II.getName() + ".epil"); + VMap[&II] = NewInst; + NewBB->getInstList().push_back(NewInst); + } + VMap[BB] = NewBB; + NewBlocks.push_back(NewBB); + + if (DT) { + BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); + DT->addNewBlock(NewBB, cast(VMap[IDomBB])); + } + } } /// Insert code in the prolog/epilog code when unrolling a loop with a @@ -465,14 +505,50 @@ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, bool PreserveLCSSA) { // for now, only unroll loops that contain a single exit - if (!L->getExitingBlock()) + if (!UnrollRuntimeMultiExit && !L->getExitingBlock()) return false; - // Make sure the loop is in canonical form, and there is a single - // exit block only. + // Make sure the loop is in canonical form. if (!L->isLoopSimplifyForm()) return false; + + // Since loop is in simplify form, we'll have exactly one latch. + BasicBlock *Latch = L->getLoopLatch(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop + // These are exit blocks other than the target of the latch exiting block. + SmallVector OtherExits; + BranchInst *LatchBR = cast(Latch->getTerminator()); + unsigned int ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; + // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the + // targets of the Latch be an exit block out of the loop. This needs + // to be guaranteed by the callers of UnrollRuntimeLoopRemainder. + assert(!L->contains(LatchBR->getSuccessor(ExitIndex)) && + "one of the loop latch successors should be the exit block!"); + // Support runtime unrolling for multiple exit blocks and multiple exiting + // blocks. + if (!Exit && UnrollRuntimeMultiExit) { + assert(UseEpilogRemainder && "Multi exit unrolling is currently supported " + "unrolling with epilog remainder only!"); + Exit = LatchBR->getSuccessor(ExitIndex); + // TODO: Support multiple exiting blocks jumping to the `Exit`. This + // will need updating the logic in connectEpilog. + if (!Exit->getSinglePredecessor()) + return false; + SmallVector Exits; + L->getUniqueExitBlocks(Exits); + for (auto *BB : Exits) { + if (BB == Exit) + continue; + // TODO: For now, all other exit blocks should have no successors (only + // `Exit` block can have successors). + if (BB->getTerminator()->getNumSuccessors()) + return false; + OtherExits.push_back(BB); + } + } + if (!Exit) return false; @@ -483,7 +559,12 @@ // Only unroll loops with a computable trip count, and the trip count needs // to be an int value (allowing a pointer type is a TODO item). - const SCEV *BECountSC = SE->getBackedgeTakenCount(L); + // Since we support multiple exit blocks, we cannot use getBackEdgeTakenCount + // which finds SCEV for single exit loops. Since the main exit is the latch + // exit, we take the exitCount for the latch. This is equivalent to + // getBackEdgeTakenCount when we have just one exit, and the exiting block is + // the latch. + const SCEV *BECountSC = SE->getExitCount(L, Latch); if (isa(BECountSC) || !BECountSC->getType()->isIntegerTy()) return false; @@ -496,7 +577,6 @@ if (isa(TripCountSC)) return false; - BasicBlock *Header = L->getHeader(); BasicBlock *PreHeader = L->getLoopPreheader(); BranchInst *PreHeaderBR = cast(PreHeader->getTerminator()); const DataLayout &DL = Header->getModule()->getDataLayout(); @@ -509,19 +589,6 @@ // comment on ModVal below. if (Log2_32(Count) > BEWidth) return false; - - BasicBlock *Latch = L->getLoopLatch(); - - // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the - // targets of the Latch be the single exit block out of the loop. This needs - // to be guaranteed by the callers of UnrollRuntimeLoopRemainder. - BranchInst *LatchBR = cast(Latch->getTerminator()); - assert( - (LatchBR->getSuccessor(0) == Exit || LatchBR->getSuccessor(1) == Exit) && - "one of the loop latch successors should be " - "the exit block!"); - // Avoid warning of unused `LatchBR` variable in release builds. - (void)LatchBR; // Loop structure is the following: // // PreHeader @@ -651,7 +718,8 @@ BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, - InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, + OtherExits); // Insert the cloned blocks into the function. F->getBasicBlockList().splice(InsertBot->getIterator(), @@ -659,6 +727,7 @@ NewBlocks[0]->getIterator(), F->end()); + // Loop structure should be the following: // Epilog Prolog // Index: test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll @@ -0,0 +1,131 @@ +; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -instcombine + +; test with three exiting and three exit blocks. +define void @test1(i64 %trip) { +; CHECK-LABEL: test1 +; CHECK: %niter.nsub.7 = add i64 %niter, -8 +entry: + br label %loop_header +loop_header: + %iv = phi i64 [ 0, %entry ], [ %iv_next, %loop_latch ] + br i1 undef, label %loop_latch, label %loop_exiting_bb1 +loop_exiting_bb1: + br i1 false, label %loop_exiting_bb2, label %exit1 +loop_exiting_bb2: + br i1 false, label %loop_latch, label %exit3 +exit3: + ret void +loop_latch: + %iv_next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv_next, %trip + br i1 %cmp, label %loop_header, label %exit2.loopexit +exit1: + ret void +exit2.loopexit: + ret void +} + + +; test with three exiting and two exit blocks +; There are 2 values for the return that are calculated at every iteration. +; %sum.02 and %add. Both of these are incoming values for phi from every exiting +; unrolled block. +define i32 @test2(i32* nocapture %a, i64 %n) nounwind uwtable readonly { +; CHECK-LABEL: test2 +; CHECK-LABEL: for.exit2: +; CHECK-NEXT: %retval = phi i32 [ %sum.02, %header ], [ 42, %for.exiting_block ], [ %add, %for.body ], [ 42, %for.exiting_block.1 ], [ %add.1, %for.body.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %for.body.2 ], [ 42, %for.exiting_block.3 ], +; CHECK-LABEL: for.exit2.epil: +; CHECK-NEXT: %retval.epil = phi i32 [ %sum.02.epil, %header.epil ], [ 42, %for.exiting_block.epil ] +; CHECK: %niter.nsub.7 = add i64 %niter, -8 +entry: + br label %header + +header: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] + br i1 false, label %for.exit2, label %for.exiting_block + +for.exiting_block: + %cmp = icmp eq i64 %n, 42 + br i1 %cmp, label %for.exit2, label %for.body + +for.body: ; preds = %for.body, %entry + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.end, label %header + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ %add, %for.body ] + ret i32 %sum.0.lcssa + +for.exit2: + %retval = phi i32 [ %sum.02, %header ], [ 42, %for.exiting_block ] + ret i32 %retval +} + +; FIXME: Support multiple exiting blocks to the same latch exit block. +define i32 @test3(i32* nocapture %a, i64 %n) nounwind uwtable readonly { +; CHECK-LABEL: test3 +; CHECK-NOT: .unr +entry: + br label %header + +header: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] + br i1 undef, label %for.end, label %for.exiting_block + ;br i1 undef, label %for.exit2, label %for.exiting_block + +for.exiting_block: + %cmp = icmp eq i64 %n, 42 + br i1 %cmp, label %for.exit2, label %for.body + +for.body: ; preds = %for.body, %entry + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.end, label %header + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 0, %header ], [ %add, %for.body ] + ret i32 %sum.0.lcssa + +for.exit2: + ret i32 42 +} + + +; test with two exiting and three exit blocks. +; the non-latch exiting block has a switch. +define void @test4(i64 %trip, i64 %add) { +; CHECK-LABEL: test4 +; CHECK: %niter.nsub.7 = add i64 %niter, -8 +entry: + br label %loop_header +loop_header: + %iv = phi i64 [ 0, %entry ], [ %iv_next, %loop_latch ] + %sum = phi i64 [ 0, %entry ], [ %sum.next, %loop_latch ] + br i1 undef, label %loop_latch, label %loop_exiting_bb1 +loop_exiting_bb1: + switch i64 %sum, label %loop_latch [ + i64 24, label %exit1 + i64 42, label %exit3 + ] + +exit3: + ret void +loop_latch: + %iv_next = add nuw nsw i64 %iv, 1 + %sum.next = add i64 %sum, %add + %cmp = icmp ne i64 %iv_next, %trip + br i1 %cmp, label %loop_header, label %exit2.loopexit +exit1: + ret void +exit2.loopexit: + ret void +}