diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -136,6 +136,9 @@ RecursionSet, CxtI); } + Constant *EvaluateOnPredecessorEdge(BasicBlock *BB, BasicBlock *PredPredBB, + Value *cond); + bool MaybeCloneThreadableBasicBlock(BasicBlock *BB, Value *Cond); bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB, jumpthreading::ConstantPreference Preference, Instruction *CxtI = nullptr); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1549,6 +1549,52 @@ return MostPopularDest; } +// Try to evaluate the value of V when the control flows from PredPredBB to +// BB->getSingleSuccessor() and then on to BB. +Constant *JumpThreadingPass::EvaluateOnPredecessorEdge(BasicBlock *BB, + BasicBlock *PredPredBB, + Value *V) { + BasicBlock *PredBB = BB->getSinglePredecessor(); + assert(PredBB && "Expected a single predecessor"); + + if (Constant *Cst = dyn_cast(V)) { + return Cst; + } + + // Consult LVI if V is not an instruction in BB or PredBB. + Instruction *I = dyn_cast(V); + if (!I || (I->getParent() != BB && I->getParent() != PredBB)) { + if (DTU->hasPendingDomTreeUpdates()) + LVI->disableDT(); + else + LVI->enableDT(); + return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr); + } + + // Look into a PHI argument. + if (PHINode *PHI = dyn_cast(V)) { + if (PHI->getParent() == PredBB) + return dyn_cast(PHI->getIncomingValueForBlock(PredPredBB)); + return nullptr; + } + + // If we have a CmpInst, try to fold it for each incoming edge into PredBB. + if (CmpInst *CondCmp = dyn_cast(V)) { + if (CondCmp->getParent() == BB) { + Constant *Op0 = + EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0)); + Constant *Op1 = + EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1)); + if (Op0 && Op1) { + return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1); + } + } + return nullptr; + } + + return nullptr; +} + bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, ConstantPreference Preference, Instruction *CxtI) { @@ -1558,8 +1604,14 @@ return false; PredValueInfoTy PredValues; - if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI)) + if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, + CxtI)) { + // We don't have known values in predecessors. Try duplicating one of the + // predecessors to increase jump threading opportunities. + if (MaybeCloneThreadableBasicBlock(BB, Cond)) + return true; return false; + } assert(!PredValues.empty() && "ComputeValueKnownInPredecessors returned true with no values"); @@ -2016,6 +2068,145 @@ return ValueMapping; } +/// Attempt to clone the sole predecessor of BB if doing so increases jump +/// threading opportunities. +bool JumpThreadingPass::MaybeCloneThreadableBasicBlock(BasicBlock *BB, + Value *Cond) { + // Consider: + // + // PredBB: + // %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ] + // %tobool = icmp eq i32 %cond, 0 + // br i1 %tobool, label %BB, label ... + // + // BB: + // %cmp = icmp eq i32* %var, null + // br i1 %cmp, label ..., label ... + // + // We don't know the value of %var at BB even if we know which incoming edge + // we take to BB. However, once we duplicate PredBB for each of its incoming + // edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of + // PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB. + // + // This function simply duplicates a basic block like PredBB to increase jump + // threading opportunities. + + // Require that BB end with a Branch for simplicity. + if (!isa(BB->getTerminator())) + return false; + + // BB must have exactly one predecessor. + BasicBlock *PredBB = BB->getSinglePredecessor(); + if (!PredBB) + return false; + + // Require that PredBB end with a Branch. If PredBB ends with an + // unconditional branch, we should be merging PredBB and BB instead. For + // simplicity, we don't deal with a switch. + BranchInst *PredBBBranch = dyn_cast(PredBB->getTerminator()); + if (!PredBBBranch) + return false; + + // If PredBB has exactly one incoming edge, we don't gain anything by copying + // PredBB. + if (PredBB->getSinglePredecessor()) + return false; + + // Don't thread across a loop header. + if (LoopHeaders.count(PredBB)) + return false; + + // Find a predecessor that we can thread. For simplicity, we only consider a + // successor edge out of BB to which we thread exactly one incoming edge into + // PredBB. + unsigned ZeroCount = 0; + unsigned OneCount = 0; + BasicBlock *ZeroPred = nullptr; + BasicBlock *OnePred = nullptr; + for (BasicBlock *P : predecessors(PredBB)) { + if (Constant *Cst = EvaluateOnPredecessorEdge(BB, P, Cond)) { + if (Cst->isZeroValue()) { + ZeroCount++; + ZeroPred = P; + } else { + OneCount++; + OnePred = P; + } + } + } + + // Disregard complicated cases where we have to thread multiple edges. + BasicBlock *PredPredBB; + if (ZeroCount == 1) { + PredPredBB = ZeroPred; + } else if (OneCount == 1) { + PredPredBB = OnePred; + } else { + return false; + } + + // Check the cost of duplicating BB and PredBB. + unsigned JumpThreadCost = + getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); + JumpThreadCost += getJumpThreadDuplicationCost( + PredBB, PredBB->getTerminator(), BBDupThreshold); + if (JumpThreadCost > BBDupThreshold) { + LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName() + << "' - Cost is too high: " << JumpThreadCost << "\n"); + return false; + } + + // Now we are ready to duplicate PredBB. + + BasicBlock *NewBB = + BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread", + PredBB->getParent(), PredBB); + NewBB->moveAfter(PredBB); + + // Set the block frequency of NewBB. + if (HasProfileData) { + auto NewBBFreq = BFI->getBlockFreq(PredPredBB) * + BPI->getEdgeProbability(PredPredBB, PredBB); + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } + + // We are going to have to map operands from the original BB block to the new + // copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them + // to account for entry from PredPredBB. + DenseMap ValueMapping = + CloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB); + + // Update the terminator of PredPredBB to jump to NewBB instead of PredBB. + // This eliminates predecessors from PredPredBB, which requires us to simplify + // any PHI nodes in PredBB. + Instruction *PredPredTerm = PredPredBB->getTerminator(); + for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i) + if (PredPredTerm->getSuccessor(i) == PredBB) { + PredBB->removePredecessor(PredPredBB, true); + PredPredTerm->setSuccessor(i, NewBB); + } + + AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB, + ValueMapping); + AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB, + ValueMapping); + + BranchInst *CondBr = dyn_cast(BB->getTerminator()); + DTU->applyUpdatesPermissive( + {{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)}, + {DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)}, + {DominatorTree::Insert, PredPredBB, NewBB}, + {DominatorTree::Delete, PredPredBB, PredBB}}); + + UpdateSSA(PredBB, NewBB, ValueMapping); + + // Merge NewBB and PredBB into their respective sole predecessors. + MaybeMergeBasicBlockIntoOnlyPred(NewBB); + MaybeMergeBasicBlockIntoOnlyPred(PredBB); + + return true; +} + /// ThreadEdge - We have decided that it is safe and profitable to factor the /// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB /// across BB. Transform the IR to reflect this change. diff --git a/llvm/test/Transforms/JumpThreading/thread-two-bbs1.ll b/llvm/test/Transforms/JumpThreading/thread-two-bbs1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/thread-two-bbs1.ll @@ -0,0 +1,56 @@ +; RUN: opt < %s -jump-threading -S -verify > %t + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global i32 0, align 4 + +define void @foo(i32 %cond1, i32 %cond2) { +; CHECK-LABEL: @foo +; CHECK-LABEL: entry +entry: + %tobool = icmp eq i32 %cond1, 0 + br i1 %tobool, label %bb.cond2, label %bb.goo + +bb.goo: + call void @goo() + br label %bb.cond2 +; CHECK: call void @goo() +; CHECK-NEXT: icmp eq i32 %cond2, 0 +; CHECK-NEXT: label %bb.moo, label %bb.hoo + +bb.cond2: + %file.0 = phi i32* [ null, %bb.goo ], [ @a, %entry ] + %tobool1 = icmp eq i32 %cond2, 0 + br i1 %tobool1, label %bb.file, label %bb.hoo +; CHECK: icmp eq i32 %cond2, 0 +; CHECK-NEXT: label %bb.koo, label %bb.hoo + +bb.hoo: + call void @hoo() + br label %exit + +; CHECK-NOT: bb.file: +bb.file: + %cmp = icmp eq i32* %file.0, null + br i1 %cmp, label %bb.moo, label %bb.koo + +bb.koo: + call void @koo() + br label %exit + +bb.moo: + call void @moo() + br label %exit + +exit: + ret void +} + +declare void @goo() + +declare void @hoo() + +declare void @koo() + +declare void @moo() diff --git a/llvm/test/Transforms/JumpThreading/thread-two-bbs2.ll b/llvm/test/Transforms/JumpThreading/thread-two-bbs2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/thread-two-bbs2.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -jump-threading -S -verify > %t + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(i32 %cond1, i32 %cond2) { +; CHECK-LABEL: @foo +; CHECK-LABEL: entry +entry: + %tobool = icmp ne i32 %cond1, 0 + br i1 %tobool, label %bb.f1, label %bb.f2 + +bb.f1: + call void @f1() + br label %bb.cond2 +; CHECK call void @f1() +; CHECK-NEXT: icmp eq i32 %cond2, 0 +; CHECK-NEXT: label %exit, label %bb.f3 + +bb.f2: + call void @f2() + br label %bb.cond2 +; CHECK call void @f2() +; CHECK-NEXT: icmp eq i32 %cond2, 0 +; CHECK-NEXT: label %exit, label %bb.f4 + +bb.cond2: + %tobool1 = icmp eq i32 %cond2, 0 + br i1 %tobool1, label %exit, label %bb.cond1again + +; CHECK-NOT: bb.cond1again: +bb.cond1again: + br i1 %tobool, label %bb.f3, label %bb.f4 + +bb.f3: + call void @f3() + br label %exit + +bb.f4: + call void @f4() + br label %exit + +exit: + ret void +} + +declare void @f1() local_unnamed_addr + +declare void @f2() local_unnamed_addr + +declare void @f3() local_unnamed_addr + +declare void @f4() local_unnamed_addr