diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -139,6 +139,11 @@ RecursionSet, CxtI); } + Constant *EvaluateOnPredecessorEdge(BasicBlock *BB, BasicBlock *PredPredBB, + Value *cond); + bool MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB, Value *Cond); + void ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB, BasicBlock *PredBB, + BasicBlock *BB, BasicBlock *SuccBB); bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB, jumpthreading::ConstantPreference Preference, Instruction *CxtI = nullptr); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1548,6 +1548,52 @@ return MostPopularDest; } +// Try to evaluate the value of V when the control flows from PredPredBB to +// BB->getSinglePredecessor() and then on to BB. +Constant *JumpThreadingPass::EvaluateOnPredecessorEdge(BasicBlock *BB, + BasicBlock *PredPredBB, + Value *V) { + BasicBlock *PredBB = BB->getSinglePredecessor(); + assert(PredBB && "Expected a single predecessor"); + + if (Constant *Cst = dyn_cast(V)) { + return Cst; + } + + // Consult LVI if V is not an instruction in BB or PredBB. + Instruction *I = dyn_cast(V); + if (!I || (I->getParent() != BB && I->getParent() != PredBB)) { + if (DTU->hasPendingDomTreeUpdates()) + LVI->disableDT(); + else + LVI->enableDT(); + return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr); + } + + // Look into a PHI argument. + if (PHINode *PHI = dyn_cast(V)) { + if (PHI->getParent() == PredBB) + return dyn_cast(PHI->getIncomingValueForBlock(PredPredBB)); + return nullptr; + } + + // If we have a CmpInst, try to fold it for each incoming edge into PredBB. + if (CmpInst *CondCmp = dyn_cast(V)) { + if (CondCmp->getParent() == BB) { + Constant *Op0 = + EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0)); + Constant *Op1 = + EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1)); + if (Op0 && Op1) { + return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1); + } + } + return nullptr; + } + + return nullptr; +} + bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, ConstantPreference Preference, Instruction *CxtI) { @@ -1557,8 +1603,12 @@ return false; PredValueInfoTy PredValues; - if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI)) - return false; + if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, + CxtI)) { + // We don't have known values in predecessors. See if we can thread through + // BB and its sole predecessor. + return MaybeThreadThroughTwoBasicBlocks(BB, Cond); + } assert(!PredValues.empty() && "ComputeValueKnownInPredecessors returned true with no values"); @@ -2015,6 +2065,186 @@ return ValueMapping; } +/// Attempt to thread through two successive basic blocks. +bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB, + Value *Cond) { + // Consider: + // + // PredBB: + // %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ] + // %tobool = icmp eq i32 %cond, 0 + // br i1 %tobool, label %BB, label ... + // + // BB: + // %cmp = icmp eq i32* %var, null + // br i1 %cmp, label ..., label ... + // + // We don't know the value of %var at BB even if we know which incoming edge + // we take to BB. However, once we duplicate PredBB for each of its incoming + // edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of + // PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB. + + // Require that BB end with a Branch for simplicity. + BranchInst *CondBr = dyn_cast(BB->getTerminator()); + if (!CondBr) + return false; + + // BB must have exactly one predecessor. + BasicBlock *PredBB = BB->getSinglePredecessor(); + if (!PredBB) + return false; + + // Require that PredBB end with a Branch. If PredBB ends with an + // unconditional branch, we should be merging PredBB and BB instead. For + // simplicity, we don't deal with a switch. + BranchInst *PredBBBranch = dyn_cast(PredBB->getTerminator()); + if (!PredBBBranch) + return false; + + // If PredBB has exactly one incoming edge, we don't gain anything by copying + // PredBB. + if (PredBB->getSinglePredecessor()) + return false; + + // Don't thread across a loop header. + if (LoopHeaders.count(PredBB)) + return false; + + // Avoid complication with duplicating EH pads. + if (PredBB->isEHPad()) + return false; + + // Find a predecessor that we can thread. For simplicity, we only consider a + // successor edge out of BB to which we thread exactly one incoming edge into + // PredBB. + unsigned ZeroCount = 0; + unsigned OneCount = 0; + BasicBlock *ZeroPred = nullptr; + BasicBlock *OnePred = nullptr; + for (BasicBlock *P : predecessors(PredBB)) { + if (Constant *Cst = EvaluateOnPredecessorEdge(BB, P, Cond)) { + if (Cst->isZeroValue()) { + ZeroCount++; + ZeroPred = P; + } else { + OneCount++; + OnePred = P; + } + } + } + + // Disregard complicated cases where we have to thread multiple edges. + BasicBlock *PredPredBB; + if (ZeroCount == 1) { + PredPredBB = ZeroPred; + } else if (OneCount == 1) { + PredPredBB = OnePred; + } else { + return false; + } + + BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred); + + // If threading to the same block as we come from, we would infinite loop. + if (SuccBB == BB) { + LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() + << "' - would thread to self!\n"); + return false; + } + + // If threading this would thread across a loop header, don't thread the edge. + // See the comments above FindLoopHeaders for justifications and caveats. + if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) { + LLVM_DEBUG({ + bool BBIsHeader = LoopHeaders.count(BB); + bool SuccIsHeader = LoopHeaders.count(SuccBB); + dbgs() << " Not threading across " + << (BBIsHeader ? "loop header BB '" : "block BB '") + << BB->getName() << "' to dest " + << (SuccIsHeader ? "loop header BB '" : "block BB '") + << SuccBB->getName() + << "' - it might create an irreducible loop!\n"; + }); + return false; + } + + // Check the cost of duplicating BB and PredBB. + unsigned JumpThreadCost = + getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); + JumpThreadCost += getJumpThreadDuplicationCost( + PredBB, PredBB->getTerminator(), BBDupThreshold); + if (JumpThreadCost > BBDupThreshold) { + LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName() + << "' - Cost is too high: " << JumpThreadCost << "\n"); + return false; + } + + // Now we are ready to duplicate PredBB. + ThreadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB); + return true; +} + +void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB, + BasicBlock *PredBB, + BasicBlock *BB, + BasicBlock *SuccBB) { + LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '" + << BB->getName() << "'\n"); + + BranchInst *CondBr = cast(BB->getTerminator()); + BranchInst *PredBBBranch = cast(PredBB->getTerminator()); + + BasicBlock *NewBB = + BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread", + PredBB->getParent(), PredBB); + NewBB->moveAfter(PredBB); + + // Set the block frequency of NewBB. + if (HasProfileData) { + auto NewBBFreq = BFI->getBlockFreq(PredPredBB) * + BPI->getEdgeProbability(PredPredBB, PredBB); + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } + + // We are going to have to map operands from the original BB block to the new + // copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them + // to account for entry from PredPredBB. + DenseMap ValueMapping = + CloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB); + + // Update the terminator of PredPredBB to jump to NewBB instead of PredBB. + // This eliminates predecessors from PredPredBB, which requires us to simplify + // any PHI nodes in PredBB. + Instruction *PredPredTerm = PredPredBB->getTerminator(); + for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i) + if (PredPredTerm->getSuccessor(i) == PredBB) { + PredBB->removePredecessor(PredPredBB, true); + PredPredTerm->setSuccessor(i, NewBB); + } + + AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB, + ValueMapping); + AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB, + ValueMapping); + + DTU->applyUpdatesPermissive( + {{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)}, + {DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)}, + {DominatorTree::Insert, PredPredBB, NewBB}, + {DominatorTree::Delete, PredPredBB, PredBB}}); + + UpdateSSA(PredBB, NewBB, ValueMapping); + + // Clean up things like PHI nodes with single operands, dead instructions, + // etc. + SimplifyInstructionsInBlock(NewBB, TLI); + SimplifyInstructionsInBlock(PredBB, TLI); + + SmallVector PredsToFactor; + PredsToFactor.push_back(NewBB); + ThreadEdge(BB, PredsToFactor, SuccBB); +} + /// TryThreadEdge - Thread an edge if it's safe and profitable to do so. bool JumpThreadingPass::TryThreadEdge( BasicBlock *BB, const SmallVectorImpl &PredBBs, diff --git a/llvm/test/Transforms/JumpThreading/thread-two-bbs1.ll b/llvm/test/Transforms/JumpThreading/thread-two-bbs1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/thread-two-bbs1.ll @@ -0,0 +1,59 @@ +; RUN: opt < %s -jump-threading -S -verify | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global i32 0, align 4 + +define void @foo(i32 %cond1, i32 %cond2) { +; CHECK-LABEL: @foo +; CHECK-LABEL: entry +entry: + %tobool = icmp eq i32 %cond1, 0 + br i1 %tobool, label %bb.cond2, label %bb.f1 + +bb.f1: + call void @f1() + br label %bb.cond2 +; Verify that we branch on cond2 without checking ptr. +; CHECK: call void @f1() +; CHECK-NEXT: icmp eq i32 %cond2, 0 +; CHECK-NEXT: label %bb.f4, label %bb.f2 + +bb.cond2: + %ptr = phi i32* [ null, %bb.f1 ], [ @a, %entry ] + %tobool1 = icmp eq i32 %cond2, 0 + br i1 %tobool1, label %bb.file, label %bb.f2 +; Verify that we branch on cond2 without checking ptr. +; CHECK: icmp eq i32 %cond2, 0 +; CHECK-NEXT: label %bb.f3, label %bb.f2 + +bb.f2: + call void @f2() + br label %exit + +; Verify that we eliminate this basic block. +; CHECK-NOT: bb.file: +bb.file: + %cmp = icmp eq i32* %ptr, null + br i1 %cmp, label %bb.f4, label %bb.f3 + +bb.f3: + call void @f3() + br label %exit + +bb.f4: + call void @f4() + br label %exit + +exit: + ret void +} + +declare void @f1() + +declare void @f2() + +declare void @f3() + +declare void @f4() diff --git a/llvm/test/Transforms/JumpThreading/thread-two-bbs2.ll b/llvm/test/Transforms/JumpThreading/thread-two-bbs2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/thread-two-bbs2.ll @@ -0,0 +1,56 @@ +; RUN: opt < %s -jump-threading -S -verify | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(i32 %cond1, i32 %cond2) { +; CHECK-LABEL: @foo +; CHECK-LABEL: entry +entry: + %tobool = icmp ne i32 %cond1, 0 + br i1 %tobool, label %bb.f1, label %bb.f2 + +bb.f1: + call void @f1() + br label %bb.cond2 +; Verify that we branch on cond2 without checking tobool again. +; CHECK: call void @f1() +; CHECK-NEXT: icmp eq i32 %cond2, 0 +; CHECK-NEXT: label %exit, label %bb.f3 + +bb.f2: + call void @f2() + br label %bb.cond2 +; Verify that we branch on cond2 without checking tobool again. +; CHECK: call void @f2() +; CHECK-NEXT: icmp eq i32 %cond2, 0 +; CHECK-NEXT: label %exit, label %bb.f4 + +bb.cond2: + %tobool1 = icmp eq i32 %cond2, 0 + br i1 %tobool1, label %exit, label %bb.cond1again + +; Verify that we eliminate this basic block. +; CHECK-NOT: bb.cond1again: +bb.cond1again: + br i1 %tobool, label %bb.f3, label %bb.f4 + +bb.f3: + call void @f3() + br label %exit + +bb.f4: + call void @f4() + br label %exit + +exit: + ret void +} + +declare void @f1() local_unnamed_addr + +declare void @f2() local_unnamed_addr + +declare void @f3() local_unnamed_addr + +declare void @f4() local_unnamed_addr diff --git a/llvm/test/Transforms/JumpThreading/thread-two-bbs3.ll b/llvm/test/Transforms/JumpThreading/thread-two-bbs3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/thread-two-bbs3.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -O2 -jump-threading -S -verify | FileCheck %s + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc19.16.27026" + +define void @foo([2 x i8]* %0) personality i8* bitcast (i32 ()* @baz to i8*) { +; CHECK-LABEL: @foo +; CHECK-LABEL: entry +; Verify that we do *not* thread any edge. +; CHECK-NOT: thread +entry: + invoke void @bar(i8* null) to label %bb_invoke unwind label %bb_cleanup + +bb_invoke: + invoke void @bar(i8* null) to label %bb_exit unwind label %bb_cleanup + +bb_exit: + ret void + +bb_cleanup: + %index = phi i64 [ 1, %bb_invoke ], [ 0, %entry ] + %cond1 = phi i1 [ false, %bb_invoke ], [ true, %entry ] + %1 = cleanuppad within none [] + br i1 %cond1, label %bb_action, label %bb_done + +bb_action: + %ptr = getelementptr inbounds [2 x i8], [2 x i8]* %0, i64 0, i64 0 + %cond2 = icmp eq i64 %index, 0 + br i1 %cond2, label %bb_done, label %bb_loop + +bb_loop: + %elem1 = phi i8* [ %ptr, %bb_action ], [ %elem2, %bb_loop ] + %elem2 = getelementptr inbounds i8, i8* %elem1, i64 -1 + call void @bar(i8* %elem2) + br label %bb_loop + +bb_done: + cleanupret from %1 unwind to caller +} + +declare void @bar(i8*) +declare i32 @baz()