diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -115,9 +115,11 @@ DenseMap CloneInstructions(BasicBlock::iterator BI, BasicBlock::iterator BE, BasicBlock *NewBB, - BasicBlock *PredBB); + BasicBlock *PredBB, + BasicBlock *SuccBB, + Value *PredVal); bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl &PredBBs, - BasicBlock *SuccBB); + BasicBlock *SuccBB, Value *PredVal = nullptr); bool DuplicateCondBranchOnPHIIntoPred( BasicBlock *BB, const SmallVectorImpl &PredBBs); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1031,6 +1031,10 @@ if (IB->getNumSuccessors() == 0) return false; Condition = IB->getAddress()->stripPointerCasts(); Preference = WantBlockAddress; + } else if (ReturnInst *RetInst = dyn_cast(Terminator)) { + auto *RV = RetInst->getReturnValue(); + if (!RV || !(Condition = dyn_cast(RV))) + return false; } else { return false; // Must be an invoke or callbr. } @@ -1050,7 +1054,7 @@ // If the terminator is branching on an undef, we can pick any of the // successors to branch to. Let GetBestDestForJumpOnUndef decide. - if (isa(Condition)) { + if (isa(Condition) && BB->getTerminator()->getNumSuccessors()) { unsigned BestSucc = GetBestDestForJumpOnUndef(BB); std::vector Updates; @@ -1591,17 +1595,22 @@ Constant *Val = PredValue.first; BasicBlock *DestBB; - if (isa(Val)) - DestBB = nullptr; - else if (BranchInst *BI = dyn_cast(BB->getTerminator())) { + auto *TI = BB->getTerminator(); + if (isa(Val) && TI->getNumSuccessors() != 0) + DestBB = TI->getSuccessor(GetBestDestForJumpOnUndef(BB)); + else if (BranchInst *BI = dyn_cast(TI)) { assert(isa(Val) && "Expecting a constant integer"); DestBB = BI->getSuccessor(cast(Val)->isZero()); - } else if (SwitchInst *SI = dyn_cast(BB->getTerminator())) { + } else if (SwitchInst *SI = dyn_cast(TI)) { assert(isa(Val) && "Expecting a constant integer"); DestBB = SI->findCaseValue(cast(Val))->getCaseSuccessor(); + } else if (ReturnInst *RI = dyn_cast(TI)) { + (void) RI; + assert(RI->getReturnValue() == Cond && + "Expecting comparison result returned"); + DestBB = nullptr; } else { - assert(isa(BB->getTerminator()) - && "Unexpected terminator"); + assert(isa(TI) && "Unexpected terminator"); assert(isa(Val) && "Expecting a constant blockaddress"); DestBB = cast(Val)->getBasicBlock(); } @@ -1671,6 +1680,7 @@ CondInst->getParent() == BB) ReplaceFoldableUses(CondInst, OnlyVal); } + SimplifyInstructionsInBlock(BB, TLI); return true; } } @@ -1699,26 +1709,38 @@ // Now that we know what the most popular destination is, factor all // predecessors that will jump to it into a single predecessor. SmallVector PredsToFactor; - for (const auto &PredToDest : PredToDestList) - if (PredToDest.second == MostPopularDest) { - BasicBlock *Pred = PredToDest.first; - - // This predecessor may be a switch or something else that has multiple - // edges to the block. Factor each of these edges by listing them - // according to # occurrences in PredsToFactor. - for (BasicBlock *Succ : successors(Pred)) - if (Succ == BB) - PredsToFactor.push_back(Pred); + Value *PredVal = nullptr; + if (MostPopularDest) { + for (const auto &PredToDest : PredToDestList) + if (PredToDest.second == MostPopularDest) { + BasicBlock *Pred = PredToDest.first; + + // This predecessor may be a switch or something else that has multiple + // edges to the block. Factor each of these edges by listing them + // according to # occurrences in PredsToFactor. + for (BasicBlock *Succ : successors(Pred)) + if (Succ == BB) + PredsToFactor.push_back(Pred); + } + } else { + LLVM_DEBUG(auto *TI = BB->getTerminator(); + assert(isa(TI));); + auto *Pred = PredToDestList.begin()->first; + PredsToFactor.push_back(Pred); + // We need to find the pred val associated with the selected pred: + for (const auto &PredValue : PredValues) { + if (PredValue.second == Pred) { + PredVal = PredValue.first; + break; + } } - - // If the threadable edges are branching on an undefined value, we get to pick - // the destination that these predecessors should get to. - if (!MostPopularDest) - MostPopularDest = BB->getTerminator()-> - getSuccessor(GetBestDestForJumpOnUndef(BB)); + // FIXME: add support for factoring predecessors with + // the same PredVal + assert(PredVal && "non null pred val expected!"); + } // Ok, try to thread it! - return ThreadEdge(BB, PredsToFactor, MostPopularDest); + return ThreadEdge(BB, PredsToFactor, MostPopularDest, PredVal); } /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on @@ -1980,7 +2002,9 @@ DenseMap JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI, BasicBlock::iterator BE, BasicBlock *NewBB, - BasicBlock *PredBB) { + BasicBlock *PredBB, + BasicBlock *SuccBB, + Value *PredVal) { // We are going to have to map operands from the source basic block to the new // copy of the block 'NewBB'. If there are PHI nodes in the source basic // block, evaluate them to account for entry from PredBB. @@ -1999,7 +2023,10 @@ // keeping track of the mapping and using it to remap operands in the cloned // instructions. for (; BI != BE; ++BI) { + if (SuccBB && BI->isTerminator()) + break; Instruction *New = BI->clone(); + assert(!BI->isTerminator() || isa(New)); New->setName(BI->getName()); NewBB->getInstList().push_back(New); ValueMapping[&*BI] = New; @@ -2011,6 +2038,10 @@ if (I != ValueMapping.end()) New->setOperand(i, I->second); } + if (ReturnInst *RetInst = dyn_cast(New)) { + assert(PredVal && !SuccBB); + New->replaceUsesOfWith(RetInst->getReturnValue(), PredVal); + } } return ValueMapping; @@ -2019,9 +2050,14 @@ /// ThreadEdge - We have decided that it is safe and profitable to factor the /// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB /// across BB. Transform the IR to reflect this change. +/// When SuccBB is nullptr, there is no edge threading to be done. Instead, this +/// method clones BB into the predecessor block to allow simplification of +/// of instructions inside BB. For this case (SuccBB == nullptr) currently, only +/// BB with return instruction is handled. FIXME: handle it for general case, and +/// make this a utility. bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, const SmallVectorImpl &PredBBs, - BasicBlock *SuccBB) { + BasicBlock *SuccBB, Value *PredVal) { // If threading to the same block as we come from, we would infinite loop. if (SuccBB == BB) { LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() @@ -2062,10 +2098,14 @@ } // And finally, do it! - LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() - << "' to '" << SuccBB->getName() - << "' with cost: " << JumpThreadCost - << ", across block:\n " << *BB << "\n"); + if (SuccBB) + LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() + << "' to '" << SuccBB->getName() + << "' with cost: " << JumpThreadCost << "\n"); + else + LLVM_DEBUG(dbgs() << " Cloning BB '" << BB->getName() << "' into '" + << PredBB->getName() << "' with cost: " << JumpThreadCost + << ", across block:\n " << *BB << "\n"); if (DTU->hasPendingDomTreeUpdates()) LVI->disableDT(); @@ -2086,17 +2126,22 @@ } // Copy all the instructions from BB to NewBB except the terminator. + auto BE = BB->end(); + if (SuccBB) + BE = std::prev(BE); DenseMap ValueMapping = - CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB); + CloneInstructions(BB->begin(), BE, NewBB, PredBB, + SuccBB, PredVal); // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. - BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); - NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); - - // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the - // PHI nodes for NewBB now. - AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping); + BranchInst *NewBI = SuccBB ? BranchInst::Create(SuccBB, NewBB) : nullptr; + if (NewBI) { + NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); + // Check to see if SuccBB has PHI nodes. If so, we need to add entries to + // the PHI nodes for NewBB now. + AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping); + } // Update the terminator of PredBB to jump to NewBB instead of BB. This // eliminates predecessors from BB, which requires us to simplify any PHI @@ -2121,7 +2166,8 @@ SimplifyInstructionsInBlock(NewBB, TLI); // Update the edge weight from BB to SuccBB, which should be less than before. - UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); + if (NewBI) + UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); // Threaded an edge! ++NumThreads; @@ -2560,11 +2606,19 @@ if (LoopHeaders.count(BB)) return false; + auto IsConstOrAddr = [](Value *V) { + if (isa(V)) + return true; + V = V->stripInBoundsConstantOffsets(); + if (isa(V) || isa(V) || isa(V)) + return true; + return false; + }; + for (BasicBlock::iterator BI = BB->begin(); PHINode *PN = dyn_cast(BI); ++BI) { // Look for a Phi having at least one constant incoming value. - if (llvm::all_of(PN->incoming_values(), - [](Value *V) { return !isa(V); })) + if (llvm::none_of(PN->incoming_values(), IsConstOrAddr)) continue; auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) { @@ -2581,7 +2635,7 @@ // Look for a ICmp in BB that compares PN with a constant and is the // condition of a Select. if (Cmp->getParent() == BB && Cmp->hasOneUse() && - isa(Cmp->getOperand(1 - U.getOperandNo()))) + IsConstOrAddr(Cmp->getOperand(1 - U.getOperandNo()))) if (SelectInst *SelectI = dyn_cast(Cmp->user_back())) if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) { SI = SelectI; diff --git a/llvm/test/Transforms/JumpThreading/addr.ll b/llvm/test/Transforms/JumpThreading/addr.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/addr.ll @@ -0,0 +1,79 @@ +; RUN: opt -jump-threading -S < %s | FileCheck %s +%"struct.std::array" = type { [3 x i32] } + +@r = dso_local local_unnamed_addr global i32 0, align 4 +@_ZL1x = internal constant %"struct.std::array" { [3 x i32] [i32 1, i32 7, i32 17] }, align 4 + +; Function Attrs: nounwind uwtable +define dso_local void @foo1(i32 %arg) local_unnamed_addr #0 { +bb: + switch i32 %arg, label %bb1 [ + i32 1, label %bb4 + i32 7, label %bb3 + ] + +bb1: ; preds = %bb + %tmp = icmp eq i32 %arg, 17 + %tmp2 = select i1 %tmp, i32* getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 2), i32* getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 1, i32 0, i64 0) + br label %bb4 + +bb3: ; preds = %bb + br label %bb4 + +bb4: ; preds = %bb3, %bb1, %bb + %tmp5 = phi i32* [ getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 0), %bb ], [ %tmp2, %bb1 ], [ getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 1), %bb3 ] +; CHECK: [[VAL:%.*]] = phi i32 [ 10,{{.*}}], [ 20, {{.*}}] + %tmp6 = icmp eq i32* %tmp5, getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 1, i32 0, i64 0) +; CHECK-NOT: icmp +; CHECK-NOT: select + %tmp7 = select i1 %tmp6, i32 20, i32 10 + store i32 %tmp7, i32* @r, align 4 +; CHECK: store i32 [[VAL]], i32* @r + ret void +} + +; Function Attrs: nounwind uwtable +define dso_local i32 @foo2(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = alloca [100 x i32], align 16 + %tmp1 = bitcast [100 x i32]* %tmp to i8* + call void @llvm.lifetime.start.p0i8(i64 400, i8* nonnull %tmp1) #2 + switch i32 %arg, label %bb2 [ + i32 1, label %bb6 + i32 7, label %bb5 + ] + +bb2: ; preds = %bb + %tmp3 = icmp eq i32 %arg, 17 + %tmp4 = select i1 %tmp3, i32* getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 2), i32* getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 1, i32 0, i64 0) + br label %bb6 + +bb5: ; preds = %bb + br label %bb6 + +bb6: ; preds = %bb5, %bb2, %bb + %tmp7 = phi i32* [ getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 0), %bb ], [ %tmp4, %bb2 ], [ getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 1), %bb5 ] +; CHECK: [[VAL:%.*]] = phi i32 [ 10,{{.*}}], [ 20, {{.*}}] + %tmp8 = icmp eq i32* %tmp7, getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 1, i32 0, i64 0) + %tmp9 = select i1 %tmp8, i32 20, i32 10 +; CHECK-NOT: select + store i32 %tmp9, i32* @r, align 4 +; CHECK: store i32 [[VAL]] + %tmp10 = zext i32 %tmp9 to i64 + %tmp11 = getelementptr inbounds [100 x i32], [100 x i32]* %tmp, i64 0, i64 %tmp10 + store i32 10, i32* %tmp11, align 8 + %tmp12 = getelementptr inbounds [100 x i32], [100 x i32]* %tmp, i64 0, i64 10 + %tmp13 = load i32, i32* %tmp12, align 8 + call void @llvm.lifetime.end.p0i8(i64 400, i8* nonnull %tmp1) #2 + ret i32 %tmp13 +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { nounwind uwtable } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { nounwind } diff --git a/llvm/test/Transforms/JumpThreading/return.ll b/llvm/test/Transforms/JumpThreading/return.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/return.ll @@ -0,0 +1,103 @@ +; RUN: opt -jump-threading -S < %s | FileCheck %s +%"struct.std::array" = type { [3 x i32] } + +@_ZL1x = internal constant %"struct.std::array" { [3 x i32] [i32 1, i32 7, i32 17] }, align 4 +@g = dso_local global [10 x i32] zeroinitializer, align 16 + +; Function Attrs: norecurse nounwind readonly uwtable +define dso_local zeroext i1 @foo1(i32 %arg) local_unnamed_addr #0 { +bb: + switch i32 %arg, label %bb1 [ + i32 1, label %bb4 + i32 7, label %bb3 + ] + +bb1: ; preds = %bb + %tmp = icmp eq i32 %arg, 17 + %tmp2 = select i1 %tmp, i32* getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 2), i32* getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 1, i32 0, i64 0) + br label %bb4 +; CHECK: ret i1 true + +bb3: ; preds = %bb + br label %bb4 +; CHECK: ret i1 true + +bb4: ; preds = %bb3, %bb1, %bb + %tmp5 = phi i32* [ getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 0), %bb ], [ %tmp2, %bb1 ], [ getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 0, i32 0, i64 1), %bb3 ] + %tmp6 = icmp ne i32* %tmp5, getelementptr inbounds (%"struct.std::array", %"struct.std::array"* @_ZL1x, i64 1, i32 0, i64 0) + ret i1 %tmp6 +} + +; Function Attrs: norecurse nounwind readonly uwtable +define dso_local zeroext i1 @foo2(i32 %arg, i32* readnone %arg1) local_unnamed_addr #0 { +bb: + %tmp = icmp sgt i32 %arg, 5 + br i1 %tmp, label %bb8, label %bb2 +; CHECK: ret i1 true + +bb2: ; preds = %bb + %tmp3 = icmp sgt i32 %arg, 1 + br i1 %tmp3, label %bb8, label %bb4 +; CHECK: ret i1 false + +bb4: ; preds = %bb2 + %tmp5 = icmp eq i32 %arg, 1 + %tmp6 = getelementptr inbounds i32, i32* %arg1, i64 1 + %tmp7 = select i1 %tmp5, i32* %arg1, i32* %tmp6 + br label %bb8 + +bb8: ; preds = %bb4, %bb2, %bb + %tmp9 = phi i32* [ getelementptr inbounds ([10 x i32], [10 x i32]* @g, i64 0, i64 0), %bb ], [ getelementptr inbounds ([10 x i32], [10 x i32]* @g, i64 0, i64 1), %bb2 ], [ %tmp7, %bb4 ] + %tmp10 = icmp eq i32* %tmp9, getelementptr inbounds ([10 x i32], [10 x i32]* @g, i64 0, i64 0) + ret i1 %tmp10 +} + +define linkonce_odr hidden i1 @foo3() { +entry: + br label %land.rhs7 + +land.rhs7: ; preds = %entry + br i1 undef, label %land.rhs22, label %lor.lhs.false17 + +lor.lhs.false17: ; preds = %land.rhs7 + unreachable + +land.rhs22: ; preds = %land.rhs7 + %tobool30 = icmp ne i32 undef, 0 + ret i1 %tobool30 +; CHECK: ret i1 undef +} + +define linkonce_odr hidden i1 @foo4() { +entry: + %neg = and i32 undef, 2097152 + %tobool = icmp eq i32 %neg, 0 + br label %land.rhs7 + +land.rhs7: ; preds = %entry + br i1 %tobool, label %land.rhs22, label %lor.lhs.false17 +; CHECK: ret i1 false + +lor.lhs.false17: ; preds = %land.rhs7 + unreachable + +land.rhs22: ; preds = %land.rhs7 + %tobool30 = icmp ne i32 %neg, 0 + ret i1 %tobool30 +} + +define i1 @foo5() { +bb: + %l = load i8, i8* undef, align 8 + %i = icmp ne i8 %l, 0 + br i1 %i, label %t1, label %t2 +; CHECK: ret i1 false + +t1: ; preds = %bb + unreachable + +t2: ; preds = %bb + ret i1 %i +} + +attributes #0 = { norecurse nounwind readonly uwtable }