Index: lib/Transforms/Scalar/CallSiteSplitting.cpp
===================================================================
--- lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 
 using namespace llvm;
@@ -73,6 +74,10 @@
 
 STATISTIC(NumCallSiteSplit, "Number of call-site split");
 
+/// Allow up to DuplicationThreshold instructions before call instructions.
+/// Those instructions need to be duplicated in all split blocks.
+static unsigned DuplicationThreshold = 5;
+
 static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
                                 Value *Op) {
   CallSite CS(NewCallI);
@@ -178,12 +183,13 @@
   if (!isa<CallInst>(Instr))
     return false;
 
-  // Allow splitting a call-site only when there is no instruction before the
-  // call-site in the basic block. Based on this constraint, we only clone the
-  // call instruction, and we do not move a call-site across any other
-  // instruction.
+  // Allow splitting a call-site only when there are less than
+  // DuplicationThreshold instructions before the call. The instructions
+  // before the call will be duplicated in the split blocks and corresponding
+  // uses will be updated.
   BasicBlock *CallSiteBB = Instr->getParent();
-  if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
+  if (std::distance(CallSiteBB->getFirstNonPHIOrDbg()->getIterator(),
+                    Instr->getIterator()) >= DuplicationThreshold)
     return false;
 
   // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
@@ -198,6 +204,7 @@
 struct SplitInfo {
   BasicBlock *BB;
   Instruction *CallInst;
+  DenseMap<const Value *, Value *> Mapping;
 };
 
 /// Return true if the CS is split into its new predecessors.
@@ -253,13 +260,13 @@
   SmallVector<SplitInfo, 2> Splits;
   for (const auto &P : Preds) {
     BasicBlock *PredBB = P.first;
-    BasicBlock *SplitBlock =
-        SplitBlockPredecessors(TailBB, PredBB, ".predBB.split");
+    ValueToValueMapTy Remap;
+    BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+                TailBB, PredBB, &*std::next(Instr->getIterator()), Remap);
     assert(SplitBlock && "Unexpected new basic block split.");
 
-    Instruction *NewCI = Instr->clone();
+    Instruction *NewCI = &*std::prev(SplitBlock->getTerminator()->getIterator());
     addConditions(CS, NewCI, P.second);
-    NewCI->insertBefore(&*SplitBlock->getFirstInsertionPt());
     CallSite NewCS(NewCI);
 
     // Handle PHIs used as arguments in the call-site.
@@ -272,9 +279,19 @@
         ++ArgNo;
       }
     }
+
     DEBUG(dbgs() << "    " << *NewCI << " in " << SplitBlock->getName()
                  << "\n");
-    Splits.push_back({SplitBlock, NewCI});
+    SplitInfo SI;
+    SI.CallInst = NewCI;
+    SI.BB = SplitBlock;
+    // ValueToValueMapTy is not copy-able, but we need the mapping information
+    // for all split blocks later on, to insert PHI nodes. So we copy the
+    // mapping into a DenseMap for later use. We do not modify any mapped
+    // instructions until we use the collected maps to delete them.
+    for (const auto &KV : Remap)
+      SI.Mapping[KV.first] = KV.second;
+    Splits.push_back(SI);
   }
 
   // Replace users of the original call with a PHI mering call-sites split.
@@ -286,7 +303,26 @@
       CallPN->addIncoming(P.CallInst, P.BB);
     Instr->replaceAllUsesWith(CallPN);
   }
-  Instr->eraseFromParent();
+
+  // Remove instructions moved to split blocks from TailBB, from the duplicated
+  // call instruction to the beginning of the basic block. If an instruction
+  // has any uses, add a new PHI node to combine the values coming from the
+  // split blocks. By using reverse-order, we do not introduce unnecessary PHI
+  // nodes for def-use chains from the call instruction to the beginning of the
+  // block.
+  auto I = Instr->getReverseIterator();
+  auto End = std::next(TailBB->getFirstNonPHIOrDbg()->getReverseIterator());
+  while (I != End) {
+   Instruction *CurrentI = &*I++;
+   if (!CurrentI->use_empty()) {
+     PHINode *NewPN = PHINode::Create(CurrentI->getType(), 2);
+     for (auto &SI: Splits)
+       NewPN->addIncoming(SI.Mapping[CurrentI], SI.BB);
+     NewPN->insertBefore(&*TailBB->getFirstInsertionPt());
+     CurrentI->replaceAllUsesWith(NewPN);
+    }
+    CurrentI->eraseFromParent();
+  }
   NumCallSiteSplit++;
 }
 
Index: test/Transforms/CallSiteSplitting/callsite-instructions-before-call.ll
===================================================================
--- /dev/null
+++ test/Transforms/CallSiteSplitting/callsite-instructions-before-call.ll
@@ -0,0 +1,112 @@
+; RUN: opt -S  -callsite-splitting < %s | FileCheck %s
+
+; Instructions before a call that will be pushed to its predecessors
+; with uses after the callsite, must be patched up as PHI nodes in
+; the join block.
+define i32* @test_split_branch_phi(i32* %ptrarg, i32 %i) {
+Header:
+  %tobool = icmp ne i32* %ptrarg, null
+  br i1 %tobool, label %TBB, label %CallSite
+
+TBB:                                    ; preds = %Header
+  %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool1 = icmp ne i32 %0, 0
+  br i1 %tobool1, label %CallSite, label %End
+
+CallSite:                                          ; preds = %TBB, %Header
+  %somepointer = getelementptr i32, i32* %ptrarg, i64 18
+  call void @bar(i32* %ptrarg, i32 %i)
+  br label %End
+
+End:                                           ; preds = %CallSite, %TBB
+  %somepointerphi = phi i32* [ %somepointer, %CallSite ], [ null, %TBB ]
+  ret i32* %somepointerphi
+}
+; CHECK-LABEL: test_split_branch_phi
+; CHECK-LABEL: Header.split
+; CHECK: %[[V1:somepointer[0-9]+]] = getelementptr i32, i32* %ptrarg, i64 18
+; CHECK: call void @bar(i32* null, i32 %i)
+; CHECK: br label %CallSite
+; CHECK-LABEL: TBB.split:
+; CHECK: %[[V2:somepointer[0-9]+]] = getelementptr i32, i32* %ptrarg, i64 18
+; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %i)
+; CHECK: br label %CallSite
+; CHECK: CallSite:
+; CHECK: phi i32* [ %[[V1]], %Header.split ], [ %[[V2]], %TBB.split ]
+
+
+; Instructions before a call that will be pushed to its predecessors
+; with uses after the callsite, must be patched up as PHI nodes in
+; the join block.
+define void @split_branch_no_extra_phi(i32* %ptrarg, i32 %i) {
+Header:
+  %tobool = icmp ne i32* %ptrarg, null
+  br i1 %tobool, label %TBB, label %CallSite
+
+TBB:                                    ; preds = %Header
+  %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool1 = icmp ne i32 %0, 0
+  br i1 %tobool1, label %CallSite, label %End
+
+CallSite:                                          ; preds = %TBB, %Header
+  %i.add = add i32 %i, 99
+  call void @bar(i32* %ptrarg, i32 %i.add)
+  br label %End
+
+End:                                           ; preds = %CallSite, %TBB
+  ret void
+}
+; CHECK-LABEL: split_branch_no_extra_phi
+; CHECK-LABEL: Header.split
+; CHECK: %[[V1:.+]] = add i32 %i, 99
+; CHECK: call void @bar(i32* null, i32 %[[V1]])
+; CHECK: br label %CallSite
+; CHECK-LABEL: TBB.split:
+; CHECK: %[[V2:.+]] = add i32 %i, 99
+; CHECK: call void @bar(i32* nonnull %ptrarg, i32 %[[V2]])
+; CHECK: br label %CallSite
+; CHECK: CallSite:
+; CHECK-NOT: phi
+
+
+; In this test case, DuplicationThreshold (5) instructions are before
+; the call site.
+define void @test_no_split_threshold(i32* %ptrarg, i32 %i) {
+Header:
+  %tobool = icmp ne i32* %ptrarg, null
+  br i1 %tobool, label %TBB, label %CallSite
+
+TBB:                                    ; preds = %Header
+  %arrayidx = getelementptr inbounds i32, i32* %ptrarg, i64 42
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool1 = icmp ne i32 %0, 0
+  br i1 %tobool1, label %CallSite, label %End
+
+CallSite:                                          ; preds = %TBB, %Header
+  %idx = getelementptr inbounds i32, i32* %ptrarg, i64 32
+  %v1 = load i32, i32* %idx, align 4
+  %a1 = add i32 %i, %v1
+  %a2 = mul i32 %a1, %v1
+  %a3 = sdiv i32 %a2, 99
+  call void @bar(i32* %ptrarg, i32 %a3)
+  store i32 %a3, i32* %idx, align 4
+  br label %End
+
+End:                                           ; preds = %CallSite, %TBB
+  ret void
+}
+; CHECK-LABEL: test_no_split_threshold
+; CHECK-NOT: split
+; CHECK-LABEL: CallSite:
+; CHECK: call void @bar(i32* %ptrarg, i32 %a3)
+
+
+define void @bar(i32*, i32) {
+    ret void
+}
+
+define  void @bari(i32) {
+    ret void
+}
Index: test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
===================================================================
--- test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
+++ test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
@@ -3,15 +3,15 @@
 
 ; CHECK-LABEL: @test_simple
 ; CHECK-LABEL: Header:
-; CHECK-NEXT: br i1 undef, label %Tail.predBB.split
-; CHECK-LABEL: TBB:
-; CHECK: br i1 %cmp, label %Tail.predBB.split1
-; CHECK-LABEL: Tail.predBB.split:
+; CHECK-NEXT: br i1 undef, label %Header.split
+; CHECK-LABEL: Header.split:
 ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p)
-; CHECK-LABEL: Tail.predBB.split1:
+; CHECK-LABEL: TBB:
+; CHECK: br i1 %cmp, label %TBB.split
+; CHECK-LABEL: TBB.split:
 ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 %p)
 ; CHECK-LABEL: Tail
-; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ]
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB.split ]
 ; CHECK: ret i32 %[[MERGED]]
 define i32 @test_simple(i32* %a, i32 %v, i32 %p) {
 Header:
@@ -31,15 +31,15 @@
 
 ; CHECK-LABEL: @test_eq_eq_eq_untaken
 ; CHECK-LABEL: Header:
-; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB.split
-; CHECK-LABEL: TBB2:
-; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End
-; CHECK-LABEL: Tail.predBB.split:
+; CHECK: br i1 %tobool1, label %TBB1, label %Header.split
+; CHECK-LABEL: Header.split:
 ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
-; CHECK-LABEL: Tail.predBB.split1:
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %TBB2.split, label %End
+; CHECK-LABEL: TBB2.split:
 ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 99)
 ; CHECK-LABEL: Tail
-; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ]
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB2.split ]
 ; CHECK: ret i32 %[[MERGED]]
 define i32 @test_eq_eq_eq_untaken2(i32* %a, i32 %v, i32 %p) {
 Header:
@@ -64,15 +64,15 @@
 
 ; CHECK-LABEL: @test_eq_ne_eq_untaken
 ; CHECK-LABEL: Header:
-; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB.split
-; CHECK-LABEL: TBB2:
-; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End
-; CHECK-LABEL: Tail.predBB.split:
+; CHECK: br i1 %tobool1, label %TBB1, label %Header.split
+; CHECK-LABEL: Header.split:
 ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
-; CHECK-LABEL: Tail.predBB.split1:
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %TBB2.split, label %End
+; CHECK-LABEL: TBB2.split:
 ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 99)
 ; CHECK-LABEL: Tail
-; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ]
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header.split ], [ %[[CALL2]], %TBB2.split ]
 ; CHECK: ret i32 %[[MERGED]]
 define i32 @test_eq_ne_eq_untaken(i32* %a, i32 %v, i32 %p) {
 Header:
@@ -97,17 +97,17 @@
 
 ; CHECK-LABEL: @test_header_header2_tbb
 ; CHECK: Header2:
-; CHECK:br i1 %tobool2, label %Tail.predBB.split, label %TBB1
-; CHECK-LABEL: TBB2:
-; CHECK: br i1 %cmp2, label %Tail.predBB.split1, label %End
-; CHECK-LABEL: Tail.predBB.split:
+; CHECK:br i1 %tobool2, label %Header2.split, label %TBB1
+; CHECK-LABEL: Header2.split:
 ; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10)
-; CHECK-LABEL: Tail.predBB.split1:
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %TBB2.split, label %End
+; CHECK-LABEL: TBB2.split:
 ; NOTE: CallSiteSplitting cannot infer that %a is null here, as it currently
 ;       only supports recording conditions along a single predecessor path.
 ; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 99)
 ; CHECK-LABEL: Tail
-; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB.split ], [ %[[CALL2]], %Tail.predBB.split1 ]
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Header2.split ], [ %[[CALL2]], %TBB2.split ]
 ; CHECK: ret i32 %[[MERGED]]
 define i32 @test_header_header2_tbb(i32* %a, i32 %v, i32 %p) {
 Header:
Index: test/Transforms/CallSiteSplitting/callsite-split-debug.ll
===================================================================
--- test/Transforms/CallSiteSplitting/callsite-split-debug.ll
+++ test/Transforms/CallSiteSplitting/callsite-split-debug.ll
@@ -48,10 +48,9 @@
 ; CallSiteBB.
 
 ; CHECK-LABEL: @foo
-; CHECK-LABEL: CallsiteBB.predBB.split:
-; CHECK: [[TMP1:%[0-9]+]] = call i16 @bar(i16 1, i16 5)
-; CHECK-LABEL: CallsiteBB.predBB.split1:
-; CHECK: [[TMP2:%[0-9]+]] = call i16 @bar(i16 0, i16 5)
+; CHECK-LABEL: bb1.split:
+; CHECK: [[TMP1:%[0-9]+]] = call i16 @bar(i16 0, i16 5)
+; CHECK-LABEL: bb2.split:
+; CHECK: [[TMP2:%[0-9]+]] = call i16 @bar(i16 1, i16 5)
 ; CHECK-LABEL: CallsiteBB
-; CHECK: %phi.call = phi i16 [ [[TMP1]], %CallsiteBB.predBB.split ], [ [[TMP2]], %CallsiteBB.predBB.split1
-
+; CHECK: %phi.call = phi i16 [ [[TMP2]], %bb2.split ], [ [[TMP1]], %bb1.split
Index: test/Transforms/CallSiteSplitting/callsite-split.ll
===================================================================
--- test/Transforms/CallSiteSplitting/callsite-split.ll
+++ test/Transforms/CallSiteSplitting/callsite-split.ll
@@ -9,7 +9,7 @@
 ;CHECK-LABEL: @caller
 ;CHECK-LABEL: NextCond:
 ;CHECK: br {{.*}} label %callee.exit
-;CHECK-LABEL: CallSiteBB.predBB.split:
+;CHECK-LABEL: NextCond.split:
 ;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
 ;CHECK-LABEL: callee.exit:
 ;CHECK: call void @dummy2(%struct.bitmap* %a_elt)