Index: lib/Transforms/Scalar/SimpleLoopUnswitch.cpp =================================================================== --- lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" @@ -59,6 +60,7 @@ STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); +STATISTIC(NumGuards, "Number of guards turned into branches for unswitching"); STATISTIC(NumTrivial, "Number of unswitches that are trivial"); static cl::opt EnableNonTrivialUnswitch( @@ -70,6 +72,11 @@ UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, cl::desc("The cost threshold for unswitching a loop.")); +static cl::opt UnswitchGuards( + "simple-loop-unswitch-guards", cl::init(true), cl::Hidden, + cl::desc("If enabled, simple loop unswitching will also consider " + "llvm.experimental.guard intrinsics as unswitch candidates.")); + /// Collect all of the loop invariant input values transitively used by the /// homogeneous instruction graph from a given root. /// @@ -2169,6 +2176,77 @@ return Cost; } +/// Turns a llvm.experimental.guard intrinsic into implicit control flow branch, +/// making the following replacement: +/// +/// +/// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] +/// +/// +/// into +/// +/// +/// br i1 %cond, label %guarded, label %deopt +/// +/// guarded: +/// +/// +/// deopt: +/// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +/// unreachable +/// +/// It also makes all relevant DT and LI updates, so that all structures are in +/// valid state after this transform. +static BranchInst * +turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, + SmallVectorImpl &ExitBlocks, + DominatorTree &DT, LoopInfo &LI) { + SmallVector DTUpdates; + LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n"); + BasicBlock *CheckBB = GI->getParent(); + + // Remove all CheckBB's successors from DomTree. A block can be seen among + // successors more than once, but for DomTree it should be added only once. + SmallPtrSet Successors; + for (auto *Succ : successors(CheckBB)) + if (Successors.insert(Succ).second) + DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ}); + + Instruction *DeoptBlockTerm = + SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true); + BranchInst *CheckBI = cast(CheckBB->getTerminator()); + // SplitBlockAndInsertIfThen inserts control flow that branches to + // DeoptBlockTerm if the condition is true. We want the opposite. + CheckBI->swapSuccessors(); + + BasicBlock *GuardedBlock = CheckBI->getSuccessor(0); + GuardedBlock->setName("guarded"); + CheckBI->getSuccessor(1)->setName("deopt"); + + // We now have a new exit block. + ExitBlocks.push_back(CheckBI->getSuccessor(1)); + + GI->moveBefore(DeoptBlockTerm); + GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext())); + + // Add new successors of CheckBB into DomTree. + for (auto *Succ : successors(CheckBB)) + DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ}); + + // Now the blocks that used to be CheckBB's successors are GuardedBlock's + // successors. + for (auto *Succ : Successors) + DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ}); + + // Make proper changes to DT. + DT.applyUpdates(DTUpdates); + // Inform LI of a new loop block. + L.addBasicBlockToLoop(GuardedBlock, LI); + + ++NumGuards; + return CheckBI; +} + static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, TargetTransformInfo &TTI, @@ -2178,10 +2256,29 @@ // loop which would be handled when visiting that inner loop). SmallVector>, 4> UnswitchCandidates; + + // Whether or not we should also collect guards in the loop. + bool CollectGuards = false; + if (UnswitchGuards) { + auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction( + Intrinsic::getName(Intrinsic::experimental_guard)); + if (GuardDecl && !GuardDecl->use_empty()) + CollectGuards = true; + } + for (auto *BB : L.blocks()) { if (LI.getLoopFor(BB) != &L) continue; + if (CollectGuards) + for (auto &I : *BB) + if (isGuard(&I)) { + auto *Cond = cast(&I)->getArgOperand(0); + // TODO: Support AND, OR conditions and partial unswitching. + if (!isa(Cond) && L.isLoopInvariant(Cond)) + UnswitchCandidates.push_back({&I, {Cond}}); + } + if (auto *SI = dyn_cast(BB->getTerminator())) { // We can only consider fully loop-invariant switch conditions as we need // to completely eliminate the switch after unswitching. @@ -2346,9 +2443,12 @@ // Now scale the cost by the number of unique successors minus one. We // subtract one because there is already at least one copy of the entire // loop. This is computing the new cost of unswitching a condition. - assert(Visited.size() > 1 && + // Note that guards always have 2 unique successors that are implicit and + // will be materialized if we decide to unswitch it. + int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size(); + assert(SuccessorsCount > 1 && "Cannot unswitch a condition without multiple distinct successors!"); - return Cost * (Visited.size() - 1); + return Cost * (SuccessorsCount - 1); }; Instruction *BestUnswitchTI = nullptr; int BestUnswitchCost; @@ -2375,6 +2475,11 @@ return false; } + // If the best candidate is a guard, turn it into a branch. + if (isGuard(BestUnswitchTI)) + BestUnswitchTI = turnGuardIntoBranch(cast(BestUnswitchTI), L, + ExitBlocks, DT, LI); + LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << BestUnswitchCost << ") terminator: " << *BestUnswitchTI << "\n"); Index: test/Transforms/SimpleLoopUnswitch/guards.ll =================================================================== --- /dev/null +++ test/Transforms/SimpleLoopUnswitch/guards.ll @@ -0,0 +1,238 @@ +; RUN: opt -passes='loop(unswitch),verify' -enable-nontrivial-unswitch -simple-loop-unswitch-guards -S < %s | FileCheck %s +; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -simple-loop-unswitch-guards -S < %s | FileCheck %s + +declare void @llvm.experimental.guard(i1, ...) + +define void @test_simple_case(i1 %cond, i32 %N) { +; CHECK-LABEL: @test_simple_case( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] +; CHECK-NEXT: br label [[GUARDED_US]] +; CHECK: guarded.us: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] +; CHECK: deopt: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-NEXT: unreachable +; + +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] + %iv.next = add i32 %iv, 1 + %loop.cond = icmp slt i32 %iv.next, %N + br i1 %loop.cond, label %loop, label %exit + +exit: + ret void +} + +define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { +; CHECK-LABEL: @test_two_guards( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split.us: +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] +; CHECK: entry.split.us.split.us: +; CHECK-NEXT: br label [[LOOP_US_US:%.*]] +; CHECK: loop.us.us: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ] +; CHECK-NEXT: br label [[GUARDED_US_US:%.*]] +; CHECK: guarded.us.us: +; CHECK-NEXT: br label [[GUARDED_US2]] +; CHECK: guarded.us2: +; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]] +; CHECK: deopt1: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-NEXT: unreachable +; CHECK: deopt: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-NEXT: unreachable +; CHECK: exit: +; CHECK-NEXT: ret void +; + +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ] + call void (i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"() ] + %iv.next = add i32 %iv, 1 + %loop.cond = icmp slt i32 %iv.next, %N + br i1 %loop.cond, label %loop, label %exit + +exit: + ret void +} + +define void @test_conditional_guards(i1 %cond, i32 %N) { +; CHECK-LABEL: @test_conditional_guards( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ] +; CHECK-NEXT: [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123 +; CHECK-NEXT: br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]] +; CHECK: guard.us: +; CHECK-NEXT: br label [[GUARDED_US:%.*]] +; CHECK: backedge.us: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123 +; CHECK-NEXT: br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]] +; CHECK: guard: +; CHECK-NEXT: br label [[DEOPT:%.*]] +; CHECK: deopt: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-NEXT: unreachable +; CHECK: backedge: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]] +; + +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ] + %condition = icmp eq i32 %iv, 123 + br i1 %condition, label %guard, label %backedge + +guard: + call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] + br label %backedge + +backedge: + %iv.next = add i32 %iv, 1 + %loop.cond = icmp slt i32 %iv.next, %N + br i1 %loop.cond, label %loop, label %exit + +exit: + ret void +} + +define void @test_nested_loop(i1 %cond, i32 %N) { +; CHECK-LABEL: @test_nested_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[ENTRY_SPLIT:%.*]], label [[OUTER_LOOP_SPLIT:%.*]] +; CHECK: entry.split: +; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] +; CHECK: outer_loop: +; CHECK-NEXT: br label [[OUTER_LOOP_SPLIT_US:%.*]] +; CHECK: outer_loop.split.us: +; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK: loop.us: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] +; CHECK-NEXT: br label [[GUARDED_US]] +; CHECK: guarded.us: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[OUTER_BACKEDGE_SPLIT_US:%.*]] +; CHECK: outer_backedge.split.us: +; CHECK-NEXT: br label [[OUTER_BACKEDGE:%.*]] +; CHECK: deopt: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-NEXT: unreachable +; CHECK: outer_backedge: +; CHECK-NEXT: br i1 false, label [[OUTER_LOOP]], label [[EXIT:%.*]] +; + +entry: + br label %outer_loop + +outer_loop: + br label %loop + +loop: + %iv = phi i32 [ 0, %outer_loop ], [ %iv.next, %loop ] + call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] + %iv.next = add i32 %iv, 1 + %loop.cond = icmp slt i32 %iv.next, %N + br i1 %loop.cond, label %loop, label %outer_backedge + +outer_backedge: + br i1 undef, label %outer_loop, label %exit + +exit: + ret void +} + +define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { +; CHECK-LABEL: @test_sibling_loops( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] +; CHECK-NEXT: br label [[GUARDED_US]] +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-NEXT: unreachable +; CHECK: [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ] +; CHECK-NEXT: br label [[GUARDED_US2]] +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-NEXT: unreachable +; + +entry: + br label %loop1 + +loop1: + %iv1 = phi i32 [ 0, %entry ], [ %iv1.next, %loop1 ] + call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ] + %iv1.next = add i32 %iv1, 1 + %loop1.cond = icmp slt i32 %iv1.next, %N + br i1 %loop1.cond, label %loop1, label %between + +between: + br label %loop2 + +loop2: + %iv2 = phi i32 [ 0, %between ], [ %iv2.next, %loop2 ] + call void (i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"() ] + %iv2.next = add i32 %iv2, 1 + %loop2.cond = icmp slt i32 %iv2.next, %N + br i1 %loop2.cond, label %loop2, label %exit + +exit: + ret void +} + +; Check that we don't do anything because of cleanuppad. +; CHECK-LABEL: @test_cleanuppad( +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] +; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard( +define void @test_cleanuppad(i1 %cond, i32 %N) personality i32 (...)* @__CxxFrameHandler3 { + +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] + %iv.next = add i32 %iv, 1 + invoke void @may_throw(i32 %iv) to label %loop unwind label %exit + +exit: + %cp = cleanuppad within none [] + cleanupret from %cp unwind to caller + +} + +declare void @may_throw(i32 %i) +declare i32 @__CxxFrameHandler3(...)