diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -73,6 +73,7 @@ STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); +STATISTIC(NumSelects, "Number of selects turned into branches for unswitching"); STATISTIC(NumGuards, "Number of guards turned into branches for unswitching"); STATISTIC(NumTrivial, "Number of unswitches that are trivial"); STATISTIC( @@ -2584,6 +2585,55 @@ return Cost; } +/// Turns a select instruction into implicit control flow branch, +/// making the following replacement: +/// +/// head: +/// --code before select-- +/// select %cond, %trueval, %falseval +/// --code after select-- +/// +/// into +/// +/// head: +/// --code before select-- +/// br i1 %cond, label %then, label %tail +/// +/// then: +/// br %tail +/// +/// tail: +/// phi [ %trueval, %then ], [ %falseval, %head] +/// unreachable +/// +/// It also makes all relevant DT and LI updates, so that all structures are in +/// valid state after this transform. +static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT, + LoopInfo &LI, MemorySSAUpdater *MSSAU) { + LLVM_DEBUG(dbgs() << "Turning " << *SI << " into a branch.\n"); + BasicBlock *HeadBB = SI->getParent(); + + SplitBlockAndInsertIfThen(SI->getCondition(), SI, false, + SI->getMetadata(LLVMContext::MD_prof), &DT, &LI); + auto *CondBr = cast(HeadBB->getTerminator()); + BasicBlock *ThenBB = CondBr->getSuccessor(0), + *TailBB = CondBr->getSuccessor(1); + if (MSSAU) + MSSAU->moveAllAfterSpliceBlocks(HeadBB, TailBB, SI); + + PHINode *Phi = PHINode::Create(SI->getType(), 2, "", SI); + Phi->addIncoming(SI->getTrueValue(), ThenBB); + Phi->addIncoming(SI->getFalseValue(), HeadBB); + SI->replaceAllUsesWith(Phi); + SI->eraseFromParent(); + + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + + ++NumSelects; + return CondBr; +} + /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch, /// making the following replacement: /// @@ -2605,25 +2655,17 @@ /// /// It also makes all relevant DT and LI updates, so that all structures are in /// valid state after this transform. -static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, - DominatorTree &DT, LoopInfo &LI, - MemorySSAUpdater *MSSAU) { - SmallVector DTUpdates; +static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, DominatorTree &DT, + LoopInfo &LI, MemorySSAUpdater *MSSAU) { LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n"); BasicBlock *CheckBB = GI->getParent(); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); - // Remove all CheckBB's successors from DomTree. A block can be seen among - // successors more than once, but for DomTree it should be added only once. - SmallPtrSet Successors; - for (auto *Succ : successors(CheckBB)) - if (Successors.insert(Succ).second) - DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ}); - - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true); + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, GI->getMetadata(LLVMContext::MD_prof), + &DT, &LI); BranchInst *CheckBI = cast(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -2640,20 +2682,6 @@ GI->moveBefore(DeoptBlockTerm); GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext())); - // Add new successors of CheckBB into DomTree. - for (auto *Succ : successors(CheckBB)) - DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ}); - - // Now the blocks that used to be CheckBB's successors are GuardedBlock's - // successors. - for (auto *Succ : Successors) - DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ}); - - // Make proper changes to DT. - DT.applyUpdates(DTUpdates); - // Inform LI of a new loop block. - L.addBasicBlockToLoop(GuardedBlock, LI); - if (MSSAU) { MemoryDef *MD = cast(MSSAU->getMemorySSA()->getMemoryAccess(GI)); MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator); @@ -2682,7 +2710,6 @@ const Instruction &TI, const Loop &L, const LoopInfo &LI, const DominatorTree &DT, ArrayRef UnswitchCandidates) { - // Guards and other exiting conditions do not contribute to exponential // explosion as soon as they dominate the latch (otherwise there might be // another path to the latch remaining that does not allow to eliminate the @@ -2691,9 +2718,10 @@ const BasicBlock *CondBlock = TI.getParent(); if (DT.dominates(CondBlock, Latch) && (isGuard(&TI) || - llvm::count_if(successors(&TI), [&L](const BasicBlock *SuccBB) { - return L.contains(SuccBB); - }) <= 1)) { + (TI.isTerminator() && + llvm::count_if(successors(&TI), [&L](const BasicBlock *SuccBB) { + return L.contains(SuccBB); + }) <= 1))) { NumCostMultiplierSkipped++; return 1; } @@ -2702,12 +2730,17 @@ int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() : std::distance(LI.begin(), LI.end())); // Count amount of clones that all the candidates might cause during - // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases. + // unswitching. Branch/guard/select counts as 1, switch counts as log2 of its + // cases. int UnswitchedClones = 0; for (auto Candidate : UnswitchCandidates) { const Instruction *CI = Candidate.TI; const BasicBlock *CondBlock = CI->getParent(); bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch); + if (isa(CI)) { + UnswitchedClones++; + continue; + } if (isGuard(CI)) { if (!SkipExitingSuccessors) UnswitchedClones++; @@ -2770,15 +2803,21 @@ if (LI.getLoopFor(BB) != &L) continue; - if (CollectGuards) - for (auto &I : *BB) - if (isGuard(&I)) { - auto *Cond = - skipTrivialSelect(cast(&I)->getArgOperand(0)); - // TODO: Support AND, OR conditions and partial unswitching. - if (!isa(Cond) && L.isLoopInvariant(Cond)) - UnswitchCandidates.push_back({&I, {Cond}}); - } + for (auto &I : *BB) { + if (auto *SI = dyn_cast(&I)) { + if (SI->getType()->isIntegerTy(1)) + continue; + auto *Cond = SI->getCondition(); + if (!isa(Cond) && L.isLoopInvariant(Cond)) + UnswitchCandidates.push_back({&I, {Cond}}); + } else if (CollectGuards && isGuard(&I)) { + auto *Cond = + skipTrivialSelect(cast(&I)->getArgOperand(0)); + // TODO: Support AND, OR conditions and partial unswitching. + if (!isa(Cond) && L.isLoopInvariant(Cond)) + UnswitchCandidates.push_back({&I, {Cond}}); + } + } if (auto *SI = dyn_cast(BB->getTerminator())) { // We can only consider fully loop-invariant switch conditions as we need @@ -2983,7 +3022,8 @@ // loop. This is computing the new cost of unswitching a condition. // Note that guards always have 2 unique successors that are implicit and // will be materialized if we decide to unswitch it. - int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size(); + int SuccessorsCount = + isGuard(&TI) || isa(TI) ? 2 : Visited.size(); assert(SuccessorsCount > 1 && "Cannot unswitch a condition without multiple distinct successors!"); return (LoopCost - Cost) * (SuccessorsCount - 1); @@ -3059,10 +3099,11 @@ if (Best.TI != PartialIVCondBranch) PartialIVInfo.InstToDuplicate.clear(); - // If the best candidate is a guard, turn it into a branch. - if (isGuard(Best.TI)) - Best.TI = - turnGuardIntoBranch(cast(Best.TI), L, DT, LI, MSSAU); + // If the best candidate is a select or a guard, turn it into a branch. + if (auto *SI = dyn_cast(Best.TI)) + Best.TI = turnSelectIntoBranch(SI, DT, LI, MSSAU); + else if (isGuard(Best.TI)) + Best.TI = turnGuardIntoBranch(cast(Best.TI), DT, LI, MSSAU); LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << Best.Cost << ") terminator: " << *Best.TI << "\n"); diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll --- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll @@ -4388,3 +4388,77 @@ ; CHECK: loop_exit: ; CHECK-NEXT: ret } + +; Non-trivial loop unswitching of select instruction. +define i32 @test34(i32 %N, i1 %cond) { +; CHECK-LABEL: @test34( +entry: + br label %for.cond +; CHECK-NEXT: entry: +; CHECK-NEXT: %cond.fr = freeze i1 %cond +; CHECK-NEXT: br i1 %cond.fr, label %entry.split.us, label %entry.split + +for.cond: ; preds = %for.body, %entry + %res = phi i32 [ 0, %entry ], [ %add, %for.body ] + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %i, %N + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %for.cond + %cond1 = select i1 %cond, i32 %i, i32 42 + %add = add nuw nsw i32 %cond1, %res + %inc = add nuw nsw i32 %i, 1 + br label %for.cond + +; CHECK: entry.split.us: +; CHECK-NEXT: br label %for.cond.us + +; CHECK: for.cond.us: +; CHECK-NEXT: %[[RES_US_PHI:.*]] = phi i32 [ 0, %entry.split.us ], [ %[[ADD_US:.*]], %[[TAIL_US:.*]] ] +; CHECK-NEXT: %[[I_US:.*]] = phi i32 [ 0, %entry.split.us ], [ %[[INC_US:.*]], %[[TAIL_US]] ] +; CHECK-NEXT: %[[CMP_US:.*]] = icmp slt i32 %[[I_US]], %N +; CHECK-NEXT: br i1 %[[CMP_US]], label %for.body.us, label %for.cond.cleanup.split.us + +; CHECK: for.body.us: +; CHECK-NEXT: br label %[[THEN:.*]] + +; CHECK: [[THEN]]: +; CHECK-NEXT: br label %[[TAIL_US]] + +; CHECK: [[TAIL_US]]: +; CHECK-NEXT: %[[SEL_PHI:.*]] = phi i32 [ %[[I_US]], %[[THEN]] ] +; CHECK-NEXT: %[[ADD_US]] = add nuw nsw i32 %[[SEL_PHI]], %[[RES_US_PHI]] +; CHECK-NEXT: %[[INC_US]] = add nuw nsw i32 %[[I_US]], 1 +; CHECK-NEXT: br label %for.cond.us + +; CHECK: for.cond.cleanup.split.us: +; CHECK-NEXT: %[[RES_US:.*]] = phi i32 [ %[[RES_US_PHI]], %for.cond.us ] +; CHECK-NEXT: br label %for.cond.cleanup + +; CHECK: entry.split: +; CHECK-NEXT: br label %for.cond + +; CHECK: for.cond: +; CHECK-NEXT: %[[RES_PHI:.*]] = phi i32 [ 0, %entry.split ], [ %[[ADD:.*]], %[[TAIL:.*]] ] +; CHECK-NEXT: %[[I:.*]] = phi i32 [ 0, %entry.split ], [ %[[INC:.*]], %[[TAIL]] ] +; CHECK-NEXT: %[[CMP:.*]] = icmp slt i32 %[[I]], %N +; CHECK-NEXT: br i1 %[[CMP]], label %for.body, label %for.cond.cleanup.split + +; CHECK: for.body: +; CHECK-NEXT: br label %[[TAIL]] + +; CHECK: [[TAIL]]: +; CHECK-NEXT: %[[ADD]] = add nuw nsw i32 42, %[[RES_PHI]] +; CHECK-NEXT: %[[INC]] = add nuw nsw i32 %[[I]], 1 +; CHECK-NEXT: br label %for.cond + +; CHECK: for.cond.cleanup.split: +; CHECK-NEXT: %[[RES:.*]] = phi i32 [ %[[RES_PHI]], %for.cond ] +; CHECK-NEXT: br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret i32 %res +; CHECK: for.cond.cleanup: +; CHECK-NEXT: %[[US_PHI:.*]] = phi i32 [ %[[RES]], %for.cond.cleanup.split ], [ %[[RES_US]], %for.cond.cleanup.split.us ] +; CHECK-NEXT: ret i32 %[[US_PHI]] +}