Index: lib/Transforms/Scalar/CallSiteSplitting.cpp
===================================================================
--- lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 
 using namespace llvm;
@@ -73,6 +74,12 @@
 
 STATISTIC(NumCallSiteSplit, "Number of call-site split");
 
+/// Allow up to DuplicationLimit instructions before call instructions.
+/// Those instructions need to be duplicated in all split blocks.
+static cl::opt<unsigned>
+    DuplicationLimit("callsite-splitting-duplication-limit", cl::init(5),
+                     cl::Hidden);
+
 static void addNonNullAttribute(CallSite CS, Value *Op) {
   unsigned ArgNo = 0;
   for (auto &I : CS.args()) {
@@ -174,12 +181,13 @@
   if (!isa<CallInst>(Instr))
     return false;
 
-  // Allow splitting a call-site only when there is no instruction before the
-  // call-site in the basic block. Based on this constraint, we only clone the
-  // call instruction, and we do not move a call-site across any other
-  // instruction.
+  // Allow splitting a call-site only when there are less than
+  // DuplicationLimit instructions before the call. The instructions
+  // before the call will be duplicated in the split blocks and corresponding
+  // uses will be updated.
   BasicBlock *CallSiteBB = Instr->getParent();
-  if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
+  if (std::distance(CallSiteBB->getFirstNonPHIOrDbg()->getIterator(),
+                    Instr->getIterator()) >= DuplicationLimit)
     return false;
 
   // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
@@ -245,16 +253,19 @@
     CallPN = PHINode::Create(Instr->getType(), Preds.size(), "phi.call");
 
   DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+  SmallVector<std::pair<DenseMap<const Value *, Value *>, BasicBlock *>, 2>
+      Remaps;
   for (const auto &P : Preds) {
     BasicBlock *PredBB = P.first;
-    BasicBlock *SplitBlock =
-        SplitBlockPredecessors(TailBB, PredBB, ".predBB.split");
+    ValueToValueMapTy Remap;
+    BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+        TailBB, PredBB, &*std::next(Instr->getIterator()), Remap);
     assert(SplitBlock && "Unexpected new basic block split.");
 
-    Instruction *NewCI = Instr->clone();
+    Instruction *NewCI =
+        &*std::prev(SplitBlock->getTerminator()->getIterator());
     CallSite NewCS(NewCI);
     addConditions(NewCS, P.second);
-    NewCI->insertBefore(&*SplitBlock->getFirstInsertionPt());
 
     // Handle PHIs used as arguments in the call-site.
     for (PHINode &PN : TailBB->phis()) {
@@ -270,15 +281,46 @@
                  << "\n");
     if (CallPN)
       CallPN->addIncoming(NewCI, SplitBlock);
+
+    // ValueToValueMapTy is not copy-able, but we need the mapping information
+    // for all split blocks later on, to insert PHI nodes. So we copy the
+    // mapping into a DenseMap for later use. We do not modify any mapped
+    // instructions until we use the collected maps to delete them.
+    DenseMap<const Value *, Value *> Mapping;
+    for (const auto &KV : Remap)
+      Mapping[KV.first] = KV.second;
+    Remaps.push_back({Mapping, SplitBlock});
   }
 
+  auto OriginalBegin = TailBB->begin();
   // Replace users of the original call with a PHI mering call-sites split.
   if (CallPN) {
-    CallPN->insertBefore(TailBB->getFirstNonPHI());
+    CallPN->insertBefore(&*TailBB->begin());
     Instr->replaceAllUsesWith(CallPN);
   }
 
-  Instr->eraseFromParent();
+  // Remove instructions moved to split blocks from TailBB, from the duplicated
+  // call instruction to the beginning of the basic block. If an instruction
+  // has any uses, add a new PHI node to combine the values coming from the
+  // split blocks. The new PHI nodes are placed before the first original
+  // instruction, so we do not end up deleting them. By using reverse-order, we
+  // do not introduce unnecessary PHI nodes for def-use chains from the call
+  // instruction to the beginning of the block.
+  auto I = Instr->getReverseIterator();
+  while (I != TailBB->rend()) {
+    Instruction *CurrentI = &*I++;
+    if (!CurrentI->use_empty()) {
+      PHINode *NewPN = PHINode::Create(CurrentI->getType(), 2);
+      for (auto &Mapping : Remaps)
+        NewPN->addIncoming(Mapping.first[CurrentI], Mapping.second);
+      NewPN->insertBefore(&*TailBB->begin());
+      CurrentI->replaceAllUsesWith(NewPN);
+    }
+    CurrentI->eraseFromParent();
+    // We are done once we handled the first original instruction in TailBB.
+    if (CurrentI == &*OriginalBegin)
+      break;
+  }
   NumCallSiteSplit++;
 }
 
Index: test/Transforms/CallSiteSplitting/callsite-instructions-before-call.ll
===================================================================
--- /dev/null
+++ test/Transforms/CallSiteSplitting/callsite-instructions-before-call.ll
@@ -0,0 +1,175 @@
+; RUN: opt -S  -callsite-splitting < %s | FileCheck %s
+
+; Instructions before a call that will be pushed to its predecessors
+; with uses after the callsite, must be patched up as PHI nodes in
+; the join block.
+define i32* @test_split_branch_phi(i32* %ptrarg, i32 %i) {
+Header:
+  %tobool = icmp ne i32* %ptrarg, null
+  br i1 %tobool, label %TBB, label %CallSite
+
+TBB:                                    ; preds = %Header
+  %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool1 = icmp ne i32 %0, 0
+  br i1 %tobool1, label %CallSite, label %End
+
+CallSite:                                          ; preds = %TBB, %Header
+  %somepointer = getelementptr i32, i32* %ptrarg, i64 18
+  call void @bar(i32* %ptrarg, i32 %i)
+  br label %End
+
+End:                                           ; preds = %CallSite, %TBB
+  %somepointerphi = phi i32* [ %somepointer, %CallSite ], [ null, %TBB ]
+  ret i32* %somepointerphi
+}
+; CHECK-LABEL: test_split_branch_phi
+; CHECK-LABEL: Header.split
+; CHECK: %[[V1:somepointer[0-9]+]] = getelementptr i32, i32* %ptrarg, i64 18
+; CHECK: call void @bar(i32* null, i32 %i)
+; CHECK: br label %CallSite
+; CHECK-LABEL: TBB.split:
+; CHECK: %[[V2:somepointer[0-9]+]] = getelementptr i32, i32* %ptrarg, i64 18
+; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %i)
+; CHECK: br label %CallSite
+; CHECK: CallSite:
+; CHECK: phi i32* [ %[[V1]], %Header.split ], [ %[[V2]], %TBB.split ]
+
+
+define void @split_branch_no_extra_phi(i32* %ptrarg, i32 %i) {
+Header:
+  %tobool = icmp ne i32* %ptrarg, null
+  br i1 %tobool, label %TBB, label %CallSite
+
+TBB:                                    ; preds = %Header
+  %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool1 = icmp ne i32 %0, 0
+  br i1 %tobool1, label %CallSite, label %End
+
+CallSite:                                          ; preds = %TBB, %Header
+  %i.add = add i32 %i, 99
+  call void @bar(i32* %ptrarg, i32 %i.add)
+  br label %End
+
+End:                                           ; preds = %CallSite, %TBB
+  ret void
+}
+; CHECK-LABEL: split_branch_no_extra_phi
+; CHECK-LABEL: Header.split
+; CHECK: %[[V1:.+]] = add i32 %i, 99
+; CHECK: call void @bar(i32* null, i32 %[[V1]])
+; CHECK: br label %CallSite
+; CHECK-LABEL: TBB.split:
+; CHECK: %[[V2:.+]] = add i32 %i, 99
+; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %[[V2]])
+; CHECK: br label %CallSite
+; CHECK: CallSite:
+; CHECK-NOT: phi
+
+
+; In this test case, DuplicationThreshold (5) instructions are before
+; the call site.
+define void @test_no_split_threshold(i32* %ptrarg, i32 %i) {
+Header:
+  %tobool = icmp ne i32* %ptrarg, null
+  br i1 %tobool, label %TBB, label %CallSite
+
+TBB:                                    ; preds = %Header
+  %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool1 = icmp ne i32 %0, 0
+  br i1 %tobool1, label %CallSite, label %End
+
+CallSite:                                          ; preds = %TBB, %Header
+  %idx = getelementptr inbounds i32, i32* %ptrarg, i64 32
+  %v1 = load i32, i32* %idx, align 4
+  %a1 = add i32 %i, %v1
+  %a2 = mul i32 %a1, %v1
+  %a3 = sdiv i32 %a2, 99
+  call void @bar(i32* %ptrarg, i32 %a3)
+  store i32 %a3, i32* %idx, align 4
+  br label %End
+
+End:                                           ; preds = %CallSite, %TBB
+  ret void
+}
+; CHECK-LABEL: test_no_split_threshold
+; CHECK-NOT: split
+; CHECK-LABEL: CallSite:
+; CHECK: call void @bar(i32* %ptrarg, i32 %a3)
+
+; In this test case, the phi node %l in CallSite should be removed, as after
+; moving the call to the split blocks we can use the values directly.
+define void @test_remove_unused_phi(i32* %ptrarg, i32 %i) {
+Header:
+  %l1 = load i32, i32* undef, align 16
+  %tobool = icmp ne i32* %ptrarg, null
+  br i1 %tobool, label %TBB, label %CallSite
+
+TBB:                                    ; preds = %Header
+  %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42
+  %0 = load i32, i32* %arrayidx, align 4
+  %l2 = load i32, i32* undef, align 16
+  %tobool1 = icmp ne i32 %0, 0
+  br i1 %tobool1, label %CallSite, label %End
+
+CallSite:                                          ; preds = %TBB, %Header
+  %l = phi i32 [ %l1, %Header ], [ %l2, %TBB ]
+  call void @bar(i32* %ptrarg, i32 %l)
+  br label %End
+
+End:                                           ; preds = %CallSite, %TBB
+  ret void
+}
+; CHECK-LABEL: test_remove_unused_phi
+; CHECK-LABEL: Header.split
+; CHECK: call void @bar(i32* null, i32 %l1)
+; CHECK: br label %CallSite
+; CHECK-LABEL: TBB.split:
+; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %l2)
+; CHECK: br label %CallSite
+; CHECK-LABEL: CallSite:
+; CHECK-NOT: phi
+
+; In this test case, we need to insert a new PHI node in TailBB to combine
+; the loads we moved to the predecessors.
+define void @test_add_new_phi(i32* %ptrarg, i32 %i) {
+Header:
+  %tobool = icmp ne i32* %ptrarg, null
+  br i1 %tobool, label %TBB, label %CallSite
+
+TBB:
+  br i1 undef, label %CallSite, label %End
+
+CallSite:
+  %arrayidx112 = getelementptr inbounds i32, i32* undef, i64 1
+  %0 = load i32, i32* %arrayidx112, align 4
+  call void @bar(i32* %ptrarg, i32 %i)
+  %sub = sub nsw i32 %0, undef
+  br label %End
+
+End:                                           ; preds = %CallSite, %TBB
+  ret void
+}
+; CHECK-LABEL: test_add_new_phi
+; CHECK-LABEL: TBB1.split
+; CHECK: %[[V1:.+]] = load i32, i32*
+; CHECK: call void @bar(i32* null, i32 %0)
+; CHECK: br label %CallSite
+; CHECK-LABEL: TBB2.split:
+; CHECK: %[[V2:.+]] = load i32, i32*
+; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %0)
+; CHECK: br label %CallSite
+; CHECK-LABEL: CallSite:
+; CHECK-NEXT: %[[V3:.+]] = phi i32 [ %[[V1]], %TBB1.split ], [ %[[V2]], %TBB2.split ]
+; CHECK: %sub = sub nsw i32 %[[V3]], undef
+
+
+define void @bar(i32*, i32) {
+    ret void
+}
+
+define  void @bari(i32) {
+    ret void
+}
Index: test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
===================================================================
--- test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
+++ test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
@@ -3,15 +3,15 @@
 
 ; CHECK-LABEL: @test_simple
 ; CHECK-LABEL: Header:
-; CHECK-NEXT: br i1 undef, label %Tail.predBB.split
-; CHECK-LABEL: TBB:
-; CHECK: br i1 %cmp, label %Tail.predBB.split1
-; CHECK-LABEL: Tail.predBB.split:
+; CHECK-NEXT: br i1 undef, label %Header.split
+; CHECK-LABEL: Header.split:
 ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p)
-; CHECK-LABEL: Tail.predBB.split1:
+; CHECK-LABEL: TBB:
+; CHECK: br i1 %cmp, label %TBB.split
+; CHECK-LABEL: TBB.split:
 ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 %p)
 ; CHECK-LABEL: Tail
-; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ]
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB.split ]
 ; CHECK: ret i32 %[[MERGED]]
 define i32 @test_simple(i32* %a, i32 %v, i32 %p) {
 Header:
@@ -31,15 +31,15 @@
 
 ; CHECK-LABEL: @test_eq_eq_eq_untaken
 ; CHECK-LABEL: Header:
-; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB.split
-; CHECK-LABEL: TBB2:
-; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End
-; CHECK-LABEL: Tail.predBB.split:
+; CHECK: br i1 %tobool1, label %TBB1, label %Header.split
+; CHECK-LABEL: Header.split:
 ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
-; CHECK-LABEL: Tail.predBB.split1:
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %TBB2.split, label %End
+; CHECK-LABEL: TBB2.split:
 ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 99)
 ; CHECK-LABEL: Tail
-; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ]
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB2.split ]
 ; CHECK: ret i32 %[[MERGED]]
 define i32 @test_eq_eq_eq_untaken2(i32* %a, i32 %v, i32 %p) {
 Header:
@@ -64,15 +64,15 @@
 
 ; CHECK-LABEL: @test_eq_ne_eq_untaken
 ; CHECK-LABEL: Header:
-; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB.split
-; CHECK-LABEL: TBB2:
-; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End
-; CHECK-LABEL: Tail.predBB.split:
+; CHECK: br i1 %tobool1, label %TBB1, label %Header.split
+; CHECK-LABEL: Header.split:
 ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
-; CHECK-LABEL: Tail.predBB.split1:
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %TBB2.split, label %End
+; CHECK-LABEL: TBB2.split:
 ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 99)
 ; CHECK-LABEL: Tail
-; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ]
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB2.split ]
 ; CHECK: ret i32 %[[MERGED]]
 define i32 @test_eq_ne_eq_untaken(i32* %a, i32 %v, i32 %p) {
 Header:
@@ -97,17 +97,17 @@
 
 ; CHECK-LABEL: @test_header_header2_tbb
 ; CHECK: Header2:
-; CHECK:br i1 %tobool2, label %Tail.predBB.split, label %TBB1
-; CHECK-LABEL: TBB2:
-; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End
-; CHECK-LABEL: Tail.predBB.split:
+; CHECK:br i1 %tobool2, label %Header2.split, label %TBB1
+; CHECK-LABEL: Header2.split:
 ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10)
-; CHECK-LABEL: Tail.predBB.split1:
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %TBB2.split, label %End
+; CHECK-LABEL: TBB2.split:
 ; NOTE: CallSiteSplitting cannot infer that %a is null here, as it currently
 ;       only supports recording conditions along a single predecessor path.
 ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 99)
 ; CHECK-LABEL: Tail
-; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ]
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header2.split ], [ %[[CALL2]], %TBB2.split ]
 ; CHECK: ret i32 %[[MERGED]]
 define i32 @test_header_header2_tbb(i32* %a, i32 %v, i32 %p) {
 Header:
Index: test/Transforms/CallSiteSplitting/callsite-split-debug.ll
===================================================================
--- test/Transforms/CallSiteSplitting/callsite-split-debug.ll
+++ test/Transforms/CallSiteSplitting/callsite-split-debug.ll
@@ -48,10 +48,9 @@
 ; CallSiteBB.
 
 ; CHECK-LABEL: @foo
-; CHECK-LABEL: CallsiteBB.predBB.split:
-; CHECK: [[TMP1:%[0-9]+]] = call i16 @bar(i16 1, i16 5)
-; CHECK-LABEL: CallsiteBB.predBB.split1:
-; CHECK: [[TMP2:%[0-9]+]] = call i16 @bar(i16 0, i16 5)
+; CHECK-LABEL: bb1.split:
+; CHECK: [[TMP1:%[0-9]+]] = call i16 @bar(i16 0, i16 5)
+; CHECK-LABEL: bb2.split:
+; CHECK: [[TMP2:%[0-9]+]] = call i16 @bar(i16 1, i16 5)
 ; CHECK-LABEL: CallsiteBB
-; CHECK: %phi.call = phi i16 [ [[TMP1]], %CallsiteBB.predBB.split ], [ [[TMP2]], %CallsiteBB.predBB.split1
-
+; CHECK: %phi.call = phi i16 [ [[TMP2]], %bb2.split ], [ [[TMP1]], %bb1.split
Index: test/Transforms/CallSiteSplitting/callsite-split.ll
===================================================================
--- test/Transforms/CallSiteSplitting/callsite-split.ll
+++ test/Transforms/CallSiteSplitting/callsite-split.ll
@@ -9,7 +9,7 @@
 ;CHECK-LABEL: @caller
 ;CHECK-LABEL: NextCond:
 ;CHECK: br {{.*}} label %callee.exit
-;CHECK-LABEL: CallSiteBB.predBB.split:
+;CHECK-LABEL: NextCond.split:
 ;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
 ;CHECK-LABEL: callee.exit:
 ;CHECK: call void @dummy2(%struct.bitmap* %a_elt)