Index: lib/Transforms/Scalar/CallSiteSplitting.cpp =================================================================== --- lib/Transforms/Scalar/CallSiteSplitting.cpp +++ lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -64,6 +64,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -73,6 +74,12 @@ STATISTIC(NumCallSiteSplit, "Number of call-site split"); +/// Allow up to DuplicationLimit instructions before call instructions. +/// Those instructions need to be duplicated in all split blocks. +static cl::opt + DuplicationLimit("callsite-splitting-duplication-limit", cl::init(5), + cl::Hidden); + static void addNonNullAttribute(CallSite CS, Value *Op) { unsigned ArgNo = 0; for (auto &I : CS.args()) { @@ -174,12 +181,13 @@ if (!isa(Instr)) return false; - // Allow splitting a call-site only when there is no instruction before the - // call-site in the basic block. Based on this constraint, we only clone the - // call instruction, and we do not move a call-site across any other - // instruction. + // Allow splitting a call-site only when there are less than + // DuplicationLimit instructions before the call. The instructions + // before the call will be duplicated in the split blocks and corresponding + // uses will be updated. BasicBlock *CallSiteBB = Instr->getParent(); - if (Instr != CallSiteBB->getFirstNonPHIOrDbg()) + if (std::distance(CallSiteBB->getFirstNonPHIOrDbg()->getIterator(), + Instr->getIterator()) >= DuplicationLimit) return false; // Need 2 predecessors and cannot split an edge from an IndirectBrInst. @@ -245,16 +253,19 @@ CallPN = PHINode::Create(Instr->getType(), Preds.size(), "phi.call"); DEBUG(dbgs() << "split call-site : " << *Instr << " into \n"); + SmallVector, BasicBlock *>, 2> + Remaps; for (const auto &P : Preds) { BasicBlock *PredBB = P.first; - BasicBlock *SplitBlock = - SplitBlockPredecessors(TailBB, PredBB, ".predBB.split"); + ValueToValueMapTy Remap; + BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween( + TailBB, PredBB, &*std::next(Instr->getIterator()), Remap); assert(SplitBlock && "Unexpected new basic block split."); - Instruction *NewCI = Instr->clone(); + Instruction *NewCI = + &*std::prev(SplitBlock->getTerminator()->getIterator()); CallSite NewCS(NewCI); addConditions(NewCS, P.second); - NewCI->insertBefore(&*SplitBlock->getFirstInsertionPt()); // Handle PHIs used as arguments in the call-site. for (PHINode &PN : TailBB->phis()) { @@ -270,15 +281,46 @@ << "\n"); if (CallPN) CallPN->addIncoming(NewCI, SplitBlock); + + // ValueToValueMapTy is not copy-able, but we need the mapping information + // for all split blocks later on, to insert PHI nodes. So we copy the + // mapping into a DenseMap for later use. We do not modify any mapped + // instructions until we use the collected maps to delete them. + DenseMap Mapping; + for (const auto &KV : Remap) + Mapping[KV.first] = KV.second; + Remaps.push_back({Mapping, SplitBlock}); } + auto OriginalBegin = TailBB->begin(); // Replace users of the original call with a PHI mering call-sites split. if (CallPN) { - CallPN->insertBefore(TailBB->getFirstNonPHI()); + CallPN->insertBefore(&*TailBB->begin()); Instr->replaceAllUsesWith(CallPN); } - Instr->eraseFromParent(); + // Remove instructions moved to split blocks from TailBB, from the duplicated + // call instruction to the beginning of the basic block. If an instruction + // has any uses, add a new PHI node to combine the values coming from the + // split blocks. The new PHI nodes are placed before the first original + // instruction, so we do not end up deleting them. By using reverse-order, we + // do not introduce unnecessary PHI nodes for def-use chains from the call + // instruction to the beginning of the block. + auto I = Instr->getReverseIterator(); + while (I != TailBB->rend()) { + Instruction *CurrentI = &*I++; + if (!CurrentI->use_empty()) { + PHINode *NewPN = PHINode::Create(CurrentI->getType(), 2); + for (auto &Mapping : Remaps) + NewPN->addIncoming(Mapping.first[CurrentI], Mapping.second); + NewPN->insertBefore(&*TailBB->begin()); + CurrentI->replaceAllUsesWith(NewPN); + } + CurrentI->eraseFromParent(); + // We are done once we handled the first original instruction in TailBB. + if (CurrentI == &*OriginalBegin) + break; + } NumCallSiteSplit++; } Index: test/Transforms/CallSiteSplitting/callsite-instructions-before-call.ll =================================================================== --- /dev/null +++ test/Transforms/CallSiteSplitting/callsite-instructions-before-call.ll @@ -0,0 +1,175 @@ +; RUN: opt -S -callsite-splitting < %s | FileCheck %s + +; Instructions before a call that will be pushed to its predecessors +; with uses after the callsite, must be patched up as PHI nodes in +; the join block. +define i32* @test_split_branch_phi(i32* %ptrarg, i32 %i) { +Header: + %tobool = icmp ne i32* %ptrarg, null + br i1 %tobool, label %TBB, label %CallSite + +TBB: ; preds = %Header + %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42 + %0 = load i32, i32* %arrayidx, align 4 + %tobool1 = icmp ne i32 %0, 0 + br i1 %tobool1, label %CallSite, label %End + +CallSite: ; preds = %TBB, %Header + %somepointer = getelementptr i32, i32* %ptrarg, i64 18 + call void @bar(i32* %ptrarg, i32 %i) + br label %End + +End: ; preds = %CallSite, %TBB + %somepointerphi = phi i32* [ %somepointer, %CallSite ], [ null, %TBB ] + ret i32* %somepointerphi +} +; CHECK-LABEL: test_split_branch_phi +; CHECK-LABEL: Header.split +; CHECK: %[[V1:somepointer[0-9]+]] = getelementptr i32, i32* %ptrarg, i64 18 +; CHECK: call void @bar(i32* null, i32 %i) +; CHECK: br label %CallSite +; CHECK-LABEL: TBB.split: +; CHECK: %[[V2:somepointer[0-9]+]] = getelementptr i32, i32* %ptrarg, i64 18 +; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %i) +; CHECK: br label %CallSite +; CHECK: CallSite: +; CHECK: phi i32* [ %[[V1]], %Header.split ], [ %[[V2]], %TBB.split ] + + +define void @split_branch_no_extra_phi(i32* %ptrarg, i32 %i) { +Header: + %tobool = icmp ne i32* %ptrarg, null + br i1 %tobool, label %TBB, label %CallSite + +TBB: ; preds = %Header + %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42 + %0 = load i32, i32* %arrayidx, align 4 + %tobool1 = icmp ne i32 %0, 0 + br i1 %tobool1, label %CallSite, label %End + +CallSite: ; preds = %TBB, %Header + %i.add = add i32 %i, 99 + call void @bar(i32* %ptrarg, i32 %i.add) + br label %End + +End: ; preds = %CallSite, %TBB + ret void +} +; CHECK-LABEL: split_branch_no_extra_phi +; CHECK-LABEL: Header.split +; CHECK: %[[V1:.+]] = add i32 %i, 99 +; CHECK: call void @bar(i32* null, i32 %[[V1]]) +; CHECK: br label %CallSite +; CHECK-LABEL: TBB.split: +; CHECK: %[[V2:.+]] = add i32 %i, 99 +; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %[[V2]]) +; CHECK: br label %CallSite +; CHECK: CallSite: +; CHECK-NOT: phi + + +; In this test case, DuplicationThreshold (5) instructions are before +; the call site. +define void @test_no_split_threshold(i32* %ptrarg, i32 %i) { +Header: + %tobool = icmp ne i32* %ptrarg, null + br i1 %tobool, label %TBB, label %CallSite + +TBB: ; preds = %Header + %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42 + %0 = load i32, i32* %arrayidx, align 4 + %tobool1 = icmp ne i32 %0, 0 + br i1 %tobool1, label %CallSite, label %End + +CallSite: ; preds = %TBB, %Header + %idx = getelementptr inbounds i32, i32* %ptrarg, i64 32 + %v1 = load i32, i32* %idx, align 4 + %a1 = add i32 %i, %v1 + %a2 = mul i32 %a1, %v1 + %a3 = sdiv i32 %a2, 99 + call void @bar(i32* %ptrarg, i32 %a3) + store i32 %a3, i32* %idx, align 4 + br label %End + +End: ; preds = %CallSite, %TBB + ret void +} +; CHECK-LABEL: test_no_split_threshold +; CHECK-NOT: split +; CHECK-LABEL: CallSite: +; CHECK: call void @bar(i32* %ptrarg, i32 %a3) + +; In this test case, the phi node %l in CallSite should be removed, as after +; moving the call to the split blocks we can use the values directly. +define void @test_remove_unused_phi(i32* %ptrarg, i32 %i) { +Header: + %l1 = load i32, i32* undef, align 16 + %tobool = icmp ne i32* %ptrarg, null + br i1 %tobool, label %TBB, label %CallSite + +TBB: ; preds = %Header + %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42 + %0 = load i32, i32* %arrayidx, align 4 + %l2 = load i32, i32* undef, align 16 + %tobool1 = icmp ne i32 %0, 0 + br i1 %tobool1, label %CallSite, label %End + +CallSite: ; preds = %TBB, %Header + %l = phi i32 [ %l1, %Header ], [ %l2, %TBB ] + call void @bar(i32* %ptrarg, i32 %l) + br label %End + +End: ; preds = %CallSite, %TBB + ret void +} +; CHECK-LABEL: test_remove_unused_phi +; CHECK-LABEL: Header.split +; CHECK: call void @bar(i32* null, i32 %l1) +; CHECK: br label %CallSite +; CHECK-LABEL: TBB.split: +; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %l2) +; CHECK: br label %CallSite +; CHECK-LABEL: CallSite: +; CHECK-NOT: phi + +; In this test case, we need to insert a new PHI node in TailBB to combine +; the loads we moved to the predecessors. +define void @test_add_new_phi(i32* %ptrarg, i32 %i) { +Header: + %tobool = icmp ne i32* %ptrarg, null + br i1 %tobool, label %TBB, label %CallSite + +TBB: + br i1 undef, label %CallSite, label %End + +CallSite: + %arrayidx112 = getelementptr inbounds i32, i32* undef, i64 1 + %0 = load i32, i32* %arrayidx112, align 4 + call void @bar(i32* %ptrarg, i32 %i) + %sub = sub nsw i32 %0, undef + br label %End + +End: ; preds = %CallSite, %TBB + ret void +} +; CHECK-LABEL: test_add_new_phi +; CHECK-LABEL: TBB1.split +; CHECK: %[[V1:.+]] = load i32, i32* +; CHECK: call void @bar(i32* null, i32 %0) +; CHECK: br label %CallSite +; CHECK-LABEL: TBB2.split: +; CHECK: %[[V2:.+]] = load i32, i32* +; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %0) +; CHECK: br label %CallSite +; CHECK-LABEL: CallSite: +; CHECK-NEXT: %[[V3:.+]] = phi i32 [ %[[V1]], %TBB1.split ], [ %[[V2]], %TBB2.split ] +; CHECK: %sub = sub nsw i32 %[[V3]], undef + + +define void @bar(i32*, i32) { + ret void +} + +define void @bari(i32) { + ret void +} Index: test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll =================================================================== --- test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll +++ test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll @@ -3,15 +3,15 @@ ; CHECK-LABEL: @test_simple ; CHECK-LABEL: Header: -; CHECK-NEXT: br i1 undef, label %Tail.predBB.split -; CHECK-LABEL: TBB: -; CHECK: br i1 %cmp, label %Tail.predBB.split1 -; CHECK-LABEL: Tail.predBB.split: +; CHECK-NEXT: br i1 undef, label %Header.split +; CHECK-LABEL: Header.split: ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p) -; CHECK-LABEL: Tail.predBB.split1: +; CHECK-LABEL: TBB: +; CHECK: br i1 %cmp, label %TBB.split +; CHECK-LABEL: TBB.split: ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 %p) ; CHECK-LABEL: Tail -; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ] +; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB.split ] ; CHECK: ret i32 %[[MERGED]] define i32 @test_simple(i32* %a, i32 %v, i32 %p) { Header: @@ -31,15 +31,15 @@ ; CHECK-LABEL: @test_eq_eq_eq_untaken ; CHECK-LABEL: Header: -; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB.split -; CHECK-LABEL: TBB2: -; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End -; CHECK-LABEL: Tail.predBB.split: +; CHECK: br i1 %tobool1, label %TBB1, label %Header.split +; CHECK-LABEL: Header.split: ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p) -; CHECK-LABEL: Tail.predBB.split1: +; CHECK-LABEL: TBB2: +; CHECK: br i1 %cmp2, label %TBB2.split, label %End +; CHECK-LABEL: TBB2.split: ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 99) ; CHECK-LABEL: Tail -; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ] +; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB2.split ] ; CHECK: ret i32 %[[MERGED]] define i32 @test_eq_eq_eq_untaken2(i32* %a, i32 %v, i32 %p) { Header: @@ -64,15 +64,15 @@ ; CHECK-LABEL: @test_eq_ne_eq_untaken ; CHECK-LABEL: Header: -; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB.split -; CHECK-LABEL: TBB2: -; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End -; CHECK-LABEL: Tail.predBB.split: +; CHECK: br i1 %tobool1, label %TBB1, label %Header.split +; CHECK-LABEL: Header.split: ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p) -; CHECK-LABEL: Tail.predBB.split1: +; CHECK-LABEL: TBB2: +; CHECK: br i1 %cmp2, label %TBB2.split, label %End +; CHECK-LABEL: TBB2.split: ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 99) ; CHECK-LABEL: Tail -; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ] +; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB2.split ] ; CHECK: ret i32 %[[MERGED]] define i32 @test_eq_ne_eq_untaken(i32* %a, i32 %v, i32 %p) { Header: @@ -97,17 +97,17 @@ ; CHECK-LABEL: @test_header_header2_tbb ; CHECK: Header2: -; CHECK:br i1 %tobool2, label %Tail.predBB.split, label %TBB1 -; CHECK-LABEL: TBB2: -; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End -; CHECK-LABEL: Tail.predBB.split: +; CHECK:br i1 %tobool2, label %Header2.split, label %TBB1 +; CHECK-LABEL: Header2.split: ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10) -; CHECK-LABEL: Tail.predBB.split1: +; CHECK-LABEL: TBB2: +; CHECK: br i1 %cmp2, label %TBB2.split, label %End +; CHECK-LABEL: TBB2.split: ; NOTE: CallSiteSplitting cannot infer that %a is null here, as it currently ; only supports recording conditions along a single predecessor path. ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 99) ; CHECK-LABEL: Tail -; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ] +; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header2.split ], [ %[[CALL2]], %TBB2.split ] ; CHECK: ret i32 %[[MERGED]] define i32 @test_header_header2_tbb(i32* %a, i32 %v, i32 %p) { Header: Index: test/Transforms/CallSiteSplitting/callsite-split-debug.ll =================================================================== --- test/Transforms/CallSiteSplitting/callsite-split-debug.ll +++ test/Transforms/CallSiteSplitting/callsite-split-debug.ll @@ -48,10 +48,9 @@ ; CallSiteBB. ; CHECK-LABEL: @foo -; CHECK-LABEL: CallsiteBB.predBB.split: -; CHECK: [[TMP1:%[0-9]+]] = call i16 @bar(i16 1, i16 5) -; CHECK-LABEL: CallsiteBB.predBB.split1: -; CHECK: [[TMP2:%[0-9]+]] = call i16 @bar(i16 0, i16 5) +; CHECK-LABEL: bb1.split: +; CHECK: [[TMP1:%[0-9]+]] = call i16 @bar(i16 0, i16 5) +; CHECK-LABEL: bb2.split: +; CHECK: [[TMP2:%[0-9]+]] = call i16 @bar(i16 1, i16 5) ; CHECK-LABEL: CallsiteBB -; CHECK: %phi.call = phi i16 [ [[TMP1]], %CallsiteBB.predBB.split ], [ [[TMP2]], %CallsiteBB.predBB.split1 - +; CHECK: %phi.call = phi i16 [ [[TMP2]], %bb2.split ], [ [[TMP1]], %bb1.split Index: test/Transforms/CallSiteSplitting/callsite-split.ll =================================================================== --- test/Transforms/CallSiteSplitting/callsite-split.ll +++ test/Transforms/CallSiteSplitting/callsite-split.ll @@ -9,7 +9,7 @@ ;CHECK-LABEL: @caller ;CHECK-LABEL: NextCond: ;CHECK: br {{.*}} label %callee.exit -;CHECK-LABEL: CallSiteBB.predBB.split: +;CHECK-LABEL: NextCond.split: ;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false) ;CHECK-LABEL: callee.exit: ;CHECK: call void @dummy2(%struct.bitmap* %a_elt)