Index: include/llvm/Analysis/LoopInfo.h
===================================================================
--- include/llvm/Analysis/LoopInfo.h
+++ include/llvm/Analysis/LoopInfo.h
@@ -561,6 +561,14 @@
   /// Otherwise return null.
   BasicBlock *getUniqueExitBlock() const;
 
+  /// If among getUniqueExitBlocks there is only one block with successors,
+  /// return it, ignoring exit blocks that end in unreachable (e.g. after a
+  /// call to exit(), a failed assert, or a throw). Otherwise return null.
+  BasicBlock *getUniqueReachableExitBlock() const;
+
+  /// Same as above, but return the single exiting block to a reachable exit.
+  BasicBlock *getExitingToReachableBlock() const;
+
   void dump() const;
   void dumpVerbose() const;
Index: include/llvm/Transforms/Utils/UnrollLoop.h
===================================================================
--- include/llvm/Transforms/Utils/UnrollLoop.h
+++ include/llvm/Transforms/Utils/UnrollLoop.h
@@ -69,7 +69,7 @@
                                 AssumptionCache *AC, bool PreserveLCSSA);
 
-void computePeelCount(Loop *L, unsigned LoopSize,
+void computePeelCount(Loop *L, DominatorTree &DT, unsigned LoopSize,
                       TargetTransformInfo::UnrollingPreferences &UP,
                       unsigned &TripCount, ScalarEvolution &SE);
 
Index: lib/Analysis/LoopInfo.cpp
===================================================================
--- lib/Analysis/LoopInfo.cpp
+++ lib/Analysis/LoopInfo.cpp
@@ -432,6 +432,63 @@
   }
 }
 
+BasicBlock *Loop::getUniqueReachableExitBlock() const {
+  SmallVector<BasicBlock *, 8> UniqueExitBlocks;
+  getUniqueExitBlocks(UniqueExitBlocks);
+  if (UniqueExitBlocks.size() == 1)
+    return UniqueExitBlocks[0];
+  BasicBlock *ReachableExit = nullptr;
+  for (BasicBlock *BB : UniqueExitBlocks) {
+    const TerminatorInst *TI = BB->getTerminator();
+    // An exit block that ends in unreachable, or that is terminated by a
+    // call to @llvm.experimental.deoptimize, is treated as unreachable,
+    // since the @llvm.experimental.deoptimize call is expected to
+    // practically never execute.
+    if (TI->getNumSuccessors() == 0 &&
+        (isa<UnreachableInst>(TI) || BB->getTerminatingDeoptimizeCall()))
+      continue;
+
+    // A second reachable exit was found; give up.
+    if (ReachableExit)
+      return nullptr;
+    ReachableExit = BB;
+  }
+  return ReachableExit;
+}
+
+BasicBlock *Loop::getExitingToReachableBlock() const {
+  SmallVector<BasicBlock *, 8> ExitingBlocks;
+  getExitingBlocks(ExitingBlocks);
+  if (ExitingBlocks.size() == 1)
+    return ExitingBlocks[0];
+  BasicBlock *ExitingToReachable = nullptr;
+  for (BasicBlock *BB : ExitingBlocks) {
+    BasicBlock *EB = nullptr;
+    for (BasicBlock *Succ : successors(BB)) {
+      if (contains(Succ))
+        continue;
+      if (EB)
+        return nullptr;
+      EB = Succ;
+    }
+    const TerminatorInst *TI = EB->getTerminator();
+    // As above, an exit that ends in unreachable or in a call to
+    // @llvm.experimental.deoptimize is treated as unreachable, since such a
+    // call is expected to practically never execute.
+    if (TI->getNumSuccessors() == 0 &&
+        (isa<UnreachableInst>(TI) || EB->getTerminatingDeoptimizeCall()))
+      continue;
+    // A second exiting block leading to a reachable exit was found; give up.
+    if (ExitingToReachable)
+      return nullptr;
+    ExitingToReachable = BB;
+  }
+  return ExitingToReachable;
+}
+
 BasicBlock *Loop::getUniqueExitBlock() const {
   SmallVector<BasicBlock *, 8> UniqueExitBlocks;
   getUniqueExitBlocks(UniqueExitBlocks);
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -795,7 +795,7 @@
   }
 
   // 4th priority is loop peeling
-  computePeelCount(L, LoopSize, UP, TripCount, SE);
+  computePeelCount(L, DT, LoopSize, UP, TripCount, SE);
   if (UP.PeelCount) {
     UP.Runtime = false;
     UP.Count = 1;
Index: lib/Transforms/Utils/LoopUnrollPeel.cpp
===================================================================
--- lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
@@ -75,14 +76,14 @@
     return false;
 
-  // Only peel loops that contain a single exit
-  if (!L->getExitingBlock() || !L->getUniqueExitBlock())
+  // Only peel loops that contain a single reachable exit.
+  if (!L->getExitingToReachableBlock() || !L->getUniqueReachableExitBlock())
     return false;
 
   // Don't try to peel loops where the latch is not the exiting block.
   // This can be an indication of two different things:
   // 1) The loop is not rotated.
   // 2) The loop contains irreducible control flow that involves the latch.
-  if (L->getLoopLatch() != L->getExitingBlock())
+  if (L->getLoopLatch() != L->getExitingToReachableBlock())
     return false;
 
   return true;
@@ -217,7 +218,7 @@
 }
 
 // Return the number of iterations we want to peel off.
-void llvm::computePeelCount(Loop *L, unsigned LoopSize,
+void llvm::computePeelCount(Loop *L, DominatorTree &DT, unsigned LoopSize,
                             TargetTransformInfo::UnrollingPreferences &UP,
                             unsigned &TripCount, ScalarEvolution &SE) {
   assert(LoopSize > 0 && "Zero loop size is not allowed!");
@@ -229,6 +230,51 @@
   if (!L->empty())
     return;
 
+  // Consider peeling off the first iteration in the following case.
+  if (2 * LoopSize <= UP.Threshold && !L->getUniqueExitBlock() &&
+      L->getUniqueReachableExitBlock()) {
+    // There is more than one exit block, but only one of them is reachable
+    // (has successors); the others end in unreachable.
+    SmallVector<BasicBlock *, 4> ExitBlocks;
+    L->getUniqueExitBlocks(ExitBlocks);
+    for (BasicBlock *BB : L->blocks()) {
+      // A block that dominates every exit block executes before the first
+      // conditional exit, so all of its instructions are already safe to
+      // speculate; peeling will not help for them.
+      bool SafeToSpeculate = true;
+      for (BasicBlock *EB : ExitBlocks) {
+        if (!DT.dominates(BB, EB)) {
+          SafeToSpeculate = false;
+          break;
+        }
+      }
+      if (SafeToSpeculate)
+        continue;
+
+      for (Instruction &I : *BB) {
+        // Skip store and call instructions as they are unsafe to speculate
+        // even after peeling.
+        if (isa<StoreInst>(&I) || isa<CallInst>(&I))
+          continue;
+        bool OperandsInv = true;
+        for (Value *Op : I.operands()) {
+          if (!L->isLoopInvariant(Op)) {
+            OperandsInv = false;
+            break;
+          }
+        }
+        // An instruction with loop-invariant operands that is still unsafe
+        // to speculate is only unsafe because an unreachable exit block may
+        // be taken before it. Peeling the first iteration proves the
+        // instruction safe to speculate in the remaining iterations.
+        if (OperandsInv && !isSafeToSpeculativelyExecute(&I)) {
+          DEBUG(dbgs() << "Peel one iteration to make instruction " << I
+                       << " safe to speculate.\n");
+          UP.PeelCount = 1;
+          return;
+        }
+      }
+    }
+  }
+
   // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
   // iterations of the loop. For this we compute the number for iterations after
   // which every Phi is guaranteed to become an invariant, and try to peel the
@@ -482,10 +528,13 @@
   BasicBlock *Header = L->getHeader();
   BasicBlock *PreHeader = L->getLoopPreheader();
   BasicBlock *Latch = L->getLoopLatch();
-  BasicBlock *Exit = L->getUniqueExitBlock();
+  BasicBlock *Exit = L->getUniqueReachableExitBlock();
   Function *F = Header->getParent();
 
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+
   // Set up all the necessary basic blocks. It is convenient to split the
   // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
   // body, and a new preheader for the "real" loop.
@@ -579,6 +628,27 @@
   // previous one.
   remapInstructionsInBlocks(NewBlocks, VMap);
 
+  // Unreachable exit blocks are now allowed, so their phi nodes have to be
+  // adjusted: after peeling, each such exit block gains an extra predecessor
+  // for every peeled iteration.
+  for (BasicBlock *EB : ExitBlocks) {
+    if (EB == Exit)
+      continue;
+    for (BasicBlock::iterator I = EB->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PHI = cast<PHINode>(I);
+      for (BasicBlock *P : predecessors(EB)) {
+        if (!L->contains(P))
+          continue;
+        Value *IncomingVal = PHI->getIncomingValueForBlock(P);
+        // Use the cloned value if the incoming value is defined inside the
+        // loop; values defined outside the loop can be reused directly.
+        if (Value *ClonedVal = VMap.lookup(IncomingVal))
+          PHI->addIncoming(ClonedVal, cast<BasicBlock>(VMap[P]));
+        else
+          PHI->addIncoming(IncomingVal, cast<BasicBlock>(VMap[P]));
+      }
+    }
+  }
+
   if (DT) {
     // Latches of the cloned loops dominate over the loop exit, so idom of the
     // latter is the first cloned loop body, as original PreHeader dominates
Index: test/Transforms/LoopUnroll/peel-loop-spec-load.ll
===================================================================
--- test/Transforms/LoopUnroll/peel-loop-spec-load.ll
+++ test/Transforms/LoopUnroll/peel-loop-spec-load.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-unroll -unroll-allow-peeling -unroll-threshold=150 | FileCheck %s
+
+; Generated from the following source:
+;
+; int foo(std::vector<int> *a, std::vector<int> *b) {
+;   int s = 0;
+;   for (int i = 0; i < 1024; i++) {
+;     s += a->at(i) + b->at(i);
+;   }
+;   return s;
+; }
+;
+; Peeling is profitable here because every call to at() emits a bounds check
+; that may throw. The bounds check loads loop-invariant values which are
+; unsafe to speculate because of the potential exception.
+; Once the first iteration is peeled out of the loop, all invariant loads
+; have been executed before the loop and thus become safe to speculate.
+;
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%"class.std::vector" = type { %"struct.std::_Vector_base" }
+%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl" }
+%"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl" = type { i32*, i32*, i32* }
+
+@.str = private unnamed_addr constant [23 x i8] c"vector::_M_range_check\00", align 1
+
+; Function Attrs: noinline uwtable
+define dso_local i32 @_Z3fooPSt6vectorIiSaIiEES2_i(%"class.std::vector"* nocapture readonly, %"class.std::vector"* nocapture readonly, i32) local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3fooPSt6vectorIiSaIiEES2_i(
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[TMP0:%.*]], i64 0, i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64*
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast %"class.std::vector"* [[TMP0]] to i64*
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = ashr exact i64 [[TMP9]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[TMP1:%.*]], i64 0, i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32** [[TMP11]] to i64*
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast %"class.std::vector"* [[TMP1]] to i64*
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i32*
+; CHECK-NEXT:    br label [[DOTPEEL_BEGIN:%.*]]
+; CHECK:       .peel.begin:
+; CHECK-NEXT:    br label [[TMP15:%.*]]
+; CHECK:         [[TMP16:%.*]] = icmp ugt i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[TMP16]], label [[TMP17:%.*]], label [[TMP38:%.*]]
+; CHECK:         [[TMP18:%.*]] = load i64, i64* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, i64* [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = sub i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = ashr exact i64 [[TMP20]], 2
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ugt i64 [[TMP21]], 0
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP23:%.*]], label [[TMP45:%.*]]
+; CHECK:         [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 0
+; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to i32*
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP26]], i64 0
+; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = add i32 [[TMP25]], 0
+; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP29]], [[TMP28]]
+; CHECK-NEXT:    [[TMP31:%.*]] = add nuw nsw i64 0, 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ult i64 [[TMP31]], 1024
+; CHECK-NEXT:    br i1 [[TMP32]], label [[DOTPEEL_NEXT:%.*]], label [[TMP33:%.*]]
+; CHECK:       .peel.next:
+; CHECK-NEXT:    br label [[DOTPEEL_NEXT1:%.*]]
+; CHECK:       .peel.next1:
+; CHECK-NEXT:    br label [[DOTPEEL_NEWPH:%.*]]
+; CHECK:       .peel.newph:
+; CHECK-NEXT:    br label [[TMP34:%.*]]
+; CHECK:       .loopexit3:
+; CHECK-NEXT:    [[DOTLCSSA_PH:%.*]] = phi i32 [ [[TMP53:%.*]], [[TMP46:%.*]] ]
+; CHECK-NEXT:    br label [[TMP33]]
+; CHECK:         [[DOTLCSSA:%.*]] = phi i32 [ [[TMP30]], [[TMP23]] ], [ [[DOTLCSSA_PH]], [[DOTLOOPEXIT3:%.*]] ]
+; CHECK-NEXT:    ret i32 [[DOTLCSSA]]
+; CHECK:         [[TMP35:%.*]] = phi i64 [ [[TMP31]], [[DOTPEEL_NEWPH]] ], [ [[TMP54:%.*]], [[TMP46]] ]
+; CHECK-NEXT:    [[TMP36:%.*]] = phi i32 [ [[TMP30]], [[DOTPEEL_NEWPH]] ], [ [[TMP53]], [[TMP46]] ]
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ugt i64 [[TMP10]], [[TMP35]]
+; CHECK-NEXT:    br i1 [[TMP37]], label [[TMP39:%.*]], label [[DOTLOOPEXIT:%.*]]
+; CHECK:       .loopexit:
+; CHECK-NEXT:    br label [[TMP38]]
+; CHECK:         tail call void @_ZSt20__throw_out_of_rangePKc(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i64 0, i64 0))
+; CHECK-NEXT:    unreachable
+; CHECK:         [[TMP40:%.*]] = load i64, i64* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP41:%.*]] = load i64, i64* [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP42:%.*]] = sub i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    [[TMP43:%.*]] = ashr exact i64 [[TMP42]], 2
+; CHECK-NEXT:    [[TMP44:%.*]] = icmp ugt i64 [[TMP43]], [[TMP35]]
+; CHECK-NEXT:    br i1 [[TMP44]], label [[TMP46]], label [[DOTLOOPEXIT2:%.*]]
+; CHECK:       .loopexit2:
+; CHECK-NEXT:    br label [[TMP45]]
+; CHECK:         tail call void @_ZSt20__throw_out_of_rangePKc(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i64 0, i64 0))
+; CHECK-NEXT:    unreachable
+; CHECK:         [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i32, i32* [[TMP47]], align 4
+; CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP41]] to i32*
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP49]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP51:%.*]] = load i32, i32* [[TMP50]], align 4
+; CHECK-NEXT:    [[TMP52:%.*]] = add i32 [[TMP48]], [[TMP36]]
+; CHECK-NEXT:    [[TMP53]] = add i32 [[TMP52]], [[TMP51]]
+; CHECK-NEXT:    [[TMP54]] = add nuw nsw i64 [[TMP35]], 1
+; CHECK-NEXT:    [[TMP55:%.*]] = icmp ult i64 [[TMP54]], 1024
+; CHECK-NEXT:    br i1 [[TMP55]], label [[TMP34]], label [[DOTLOOPEXIT3]], !llvm.loop !0
+;
+  %4 = getelementptr inbounds %"class.std::vector", %"class.std::vector"* %0, i64 0, i32 0, i32 0, i32 1
+  %5 = bitcast i32** %4 to i64*
+  %6 = load i64, i64* %5, align 8
+  %7 = bitcast %"class.std::vector"* %0 to i64*
+  %8 = load i64, i64* %7, align 8
+  %9 = sub i64 %6, %8
+  %10 = ashr exact i64 %9, 2
+  %11 = getelementptr inbounds %"class.std::vector", %"class.std::vector"* %1, i64 0, i32 0, i32 0, i32 1
+  %12 = bitcast i32** %11 to i64*
+  %13 = bitcast %"class.std::vector"* %1 to i64*
+  %14 = inttoptr i64 %8 to i32*
+  br label %16

+;