diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h
--- a/bolt/include/bolt/Passes/SplitFunctions.h
+++ b/bolt/include/bolt/Passes/SplitFunctions.h
@@ -23,6 +23,10 @@
   /// Split each function into a hot and cold fragment using profiling
   /// information.
   Profile2 = 0,
+  /// Split each function into a hot, warm, and cold fragment using profiling
+  /// information. This is best used in combination with
+  /// `--reorder-blocks=ext-tsp`.
+  Profile3,
   /// Split each function into a hot and cold fragment at a randomly chosen
   /// split point (ignoring any available profiling information).
   Random2,
@@ -41,7 +45,8 @@
   virtual ~SplitStrategy() = default;
   virtual bool canSplit(const BinaryFunction &BF) = 0;
   virtual bool keepEmpty() = 0;
-  virtual void fragment(const BlockIt Start, const BlockIt End) = 0;
+  virtual void fragment(const BinaryFunction &BF, const BlockIt Start,
+                        const BlockIt End) = 0;
 };
 
 /// Split function code in multiple parts.
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <algorithm>
@@ -90,6 +91,11 @@
     cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                           "split each function into a hot and cold fragment "
                           "using profiling information")),
+    cl::values(clEnumValN(
+        SplitFunctionsStrategy::Profile3, "profile3",
+        "split each function into a hot, warm, and cold fragment using "
+        "profiling information. This is best used in combination with "
+        "--reorder-blocks=ext-tsp (experimental)")),
     cl::values(clEnumValN(
         SplitFunctionsStrategy::Random2, "random2",
         "split each function into a hot and cold fragment at a randomly chosen "
@@ -104,6 +110,12 @@
         "fragment contains exactly a single basic block")),
     cl::desc("strategy used to partition blocks into fragments"),
     cl::cat(BoltOptCategory));
+
+static cl::opt<double> SplitSizeCost(
+    "split-size-cost",
+    cl::desc("weight of a basic block's size when deciding whether to move it "
+             "into the warm fragment (with --split-strategy=profile3)"),
+    cl::init(0.01), cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace {
@@ -126,7 +138,8 @@
   bool keepEmpty() override { return false; }
 
-  void fragment(const BlockIt Start, const BlockIt End) override {
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
       if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
@@ -134,6 +147,224 @@
   }
 };
 
+/// This strategy tries to partition the function into three fragments: one
+/// fragment contains the hot code, one fragment contains the cold code, and
+/// one fragment contains the not-so-hot (or warm) code that is not cold
+/// either. The goal is not to improve the caching behavior of the function
+/// itself, because when block-level reordering is applied, the warm part
+/// already sits right between the hot and cold parts. Instead, the target is
+/// to eliminate the effect that the presence of warm code has on other
+/// functions in the hot section, while minimizing the potential cost of
+/// branching from hot code to far-away warm code.
+///
+/// This implementation minimizes the cost function λ * s + b, where λ is a
+/// constant parameter, s is the combined size of the blocks in the hot
+/// fragment, and b is the frequency of branches taken from the hot to the
+/// warm fragment. It relies on the blocks being ordered such that hotness is
+/// approximately decreasing (e.g. by ext-tsp) and determines the optimal
+/// split point by calculating the cost at every block. All blocks before
+/// that point are assigned to the hot fragment; all other blocks become
+/// either warm or cold.
+struct SplitProfile3 final : public SplitStrategy {
+  BinaryContext &BC;
+  double MedianExecutionCount;
+
+  explicit SplitProfile3(BinaryContext &BC)
+      : BC(BC), MedianExecutionCount(calculateMedianExecutionCount(BC)) {}
+
+  bool canSplit(const BinaryFunction &BF) override {
+    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
+  }
+
+  bool keepEmpty() override {
+    // If there are no blocks in the warm fragment, do not get rid of it.
+    return true;
+  }
+
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
+    BinaryContext::IndependentCodeEmitter Emitter;
+    if (!opts::NoThreads)
+      Emitter = BC.createIndependentMCCodeEmitter();
+
+    DenseMap<const BinaryBasicBlock *, size_t> Size;
+    for (const BinaryBasicBlock *const BB : make_range(Start, End))
+      Size[BB] = BB->estimateSize(Emitter.MCE.get());
+
+    // Initially, assume that the function is split right at the start, so all
+    // blocks are assigned to the warm (or cold) fragment.
+    BlockIt BestSplit = Start;
+
+    // Try to adjust the size cost by the function's execution frequency.
+    // Using the same factor for almost-cold functions as for super-hot
+    // functions would move the almost-cold functions entirely into the warm
+    // fragment - except for their entry block (a current limitation). If
+    // those warm functions are clustered with other warm functions, moving
+    // everything but the entry into warm might hurt performance (jumping back
+    // and forth between hot and warm at each call). Hence, for functions that
+    // are called less frequently, reduce the size cost to split them less
+    // aggressively.
+    double SizeCost =
+        (1.0 - 1.0 / (1.0 + BF.getExecutionCount() / MedianExecutionCount)) *
+        opts::SplitSizeCost;
+    double CurrentCost = 0.0;
+
+    // Estimate the fixed cost of all non-outlineable blocks. These blocks are
+    // always hot regardless of where the function is split. This only affects
+    // entry points and, unless --split-eh is used, exception landing pads.
+    // Should entry points become outlineable at some point in the future, add
+    // an incoming cost based on their execution count to model the cost of
+    // them being in the warm fragment (while presumably being called from hot
+    // blocks).
+    assert(!(*Start)->canOutline() &&
+           "First block is assumed to be not outlineable");
+    for (BlockIt It = Start; It != End; ++It) {
+      const BinaryBasicBlock *const BB = *It;
+
+      if (!BB->canOutline()) {
+        CurrentCost += Size[BB] * SizeCost;
+
+        for (const BinaryBasicBlock *const Predecessor : BB->predecessors()) {
+          // Add cost of all branches from predecessors that are outlineable.
+          if (Predecessor->getLayoutIndex() < BB->getLayoutIndex() &&
+              Predecessor->canOutline()) {
+            const BinaryBasicBlock::BinaryBranchInfo &Branch =
+                Predecessor->getBranchInfo(*BB);
+            CurrentCost += Branch.Count;
+          }
+        }
+
+        for (const auto Pair : zip(BB->successors(), BB->branch_info())) {
+          const BinaryBasicBlock *const Successor = std::get<0>(Pair);
+          const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
+
+          // Add cost of all branches to successors that are outlineable.
+          if (Successor->getLayoutIndex() > BB->getLayoutIndex() &&
+              Successor->canOutline())
+            CurrentCost += Branch.Count;
+        }
+      } else {
+        // Entry blocks of callees are (usually) hot. Add some cost for calls
+        // made from this (for now warm) block to hot code.
+        for (const MCInst &I : *BB) {
+          if (BC.MIB->isCall(I))
+            CurrentCost += BB->getExecutionCount() / 2.0;
+        }
+      }
+    }
+
+    double BestCost = CurrentCost;
+
+    // Now estimate the cost at each possible hot/warm split point.
+    for (BlockIt It = Start; It != End; ++It) {
+      const BinaryBasicBlock *const BB = *It;
+
+      // If the block is not outlineable, its cost was already included in the
+      // initial estimation above.
+      if (BB->canOutline()) {
+        // Splitting after BB adds cost proportional to its size.
+        CurrentCost += Size[BB] * SizeCost;
+
+        // If hot predecessors previously assumed this block would be warm
+        // (and thus cut the edge), subtract the cost that was added for those
+        // edges: they no longer cross the split.
+        for (const BinaryBasicBlock *const Predecessor : BB->predecessors()) {
+          if (Predecessor->getLayoutIndex() < BB->getLayoutIndex()) {
+            const BinaryBasicBlock::BinaryBranchInfo &Branch =
+                Predecessor->getBranchInfo(*BB);
+            CurrentCost -= Branch.Count;
+          }
+        }
+
+        // If the cut is made after this block, add the cost of cutting all
+        // outgoing edges to non-hot blocks.
+        for (const auto Pair : zip(BB->successors(), BB->branch_info())) {
+          const BinaryBasicBlock *const Successor = std::get<0>(Pair);
+          const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
+
+          if (Successor->getLayoutIndex() > BB->getLayoutIndex()) {
+            // If the successor is outlineable, it is currently considered
+            // warm (while BB is hot) and the cost of that edge is added. If
+            // the successor is not outlineable however, remove the previously
+            // added cost.
+            //
+            // Subtracting cost could split a chain of blocks where some of
+            // them are outlineable and some of them are not (e.g. `not
+            // outlineable -> outlineable -> not outlineable -> huge
+            // outlineable` could split after the second not-outlineable
+            // block). In this particular case, that is fine, because jumping
+            // back and forth between hot and warm is not desired, while it
+            // may still be beneficial to cut off a huge tail. In practice,
+            // this pattern should not appear, as only exception landing pads
+            // and function entry points cannot be outlined.
+            if (Successor->canOutline())
+              CurrentCost += Branch.Count;
+            else
+              CurrentCost -= Branch.Count;
+          }
+        }
+
+        // The block is now hot, so its calls become hot -> hot; remove the
+        // warm-to-hot call cost added in the initial estimation.
+        for (const MCInst &I : *BB) {
+          if (BC.MIB->isCall(I))
+            CurrentCost -= BB->getExecutionCount() / 2.0;
+        }
+      }
+
+      // Did the cost improve? If the cost goes down or stays the same, put
+      // the current block into the hot fragment.
+      if (CurrentCost <= BestCost) {
+        BestSplit = std::next(It);
+        BestCost = CurrentCost;
+      }
+    }
+
+    // Regardless of the split point, assign never-executed blocks to the cold
+    // fragment. If --split-all-cold is set, they will actually be moved to
+    // the cold fragment. If not, they will remain in their respective
+    // hot/warm fragment.
+    const FragmentNum Warm(1);
+    const FragmentNum Cold(2);
+    for (BinaryBasicBlock *const BB : make_range(Start, BestSplit)) {
+      if (BB->getExecutionCount() > 0)
+        BB->setFragmentNum(FragmentNum::main());
+      else
+        BB->setFragmentNum(Cold);
+    }
+
+    for (BinaryBasicBlock *const BB : reverse(make_range(BestSplit, End))) {
+      if (BB->getExecutionCount() > 0)
+        BB->setFragmentNum(Warm);
+      else
+        BB->setFragmentNum(Cold);
+    }
+  }
+
+private:
+  static double calculateMedianExecutionCount(const BinaryContext &BC) {
+    SmallVector<uint64_t> CallFrequencies;
+    llvm::transform(BC.getBinaryFunctions(),
+                    std::back_inserter(CallFrequencies),
+                    [](const std::pair<const uint64_t, BinaryFunction> &BF) {
+                      return BF.second.getExecutionCount();
+                    });
+    llvm::erase_value(CallFrequencies, BinaryFunction::COUNT_NO_PROFILE);
+    llvm::erase_value(CallFrequencies, 0);
+    llvm::sort(CallFrequencies);
+
+    // If there is no profile, setting the size cost to zero will keep blocks
+    // in the main fragment.
+    double MedianCallFrequency = 0.0;
+    if (!CallFrequencies.empty()) {
+      MedianCallFrequency = CallFrequencies[CallFrequencies.size() / 2];
+      if (CallFrequencies.size() % 2 == 0)
+        MedianCallFrequency =
+            (MedianCallFrequency +
+             CallFrequencies[CallFrequencies.size() / 2 - 1]) /
+            2.0;
+    }
+
+    return 1.0 / MedianCallFrequency;
+  }
+};
+
 struct SplitRandom2 final : public SplitStrategy {
   std::minstd_rand0 Gen;
 
@@ -143,7 +374,8 @@
   bool keepEmpty() override { return false; }
 
-  void fragment(const BlockIt Start, const BlockIt End) override {
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
     const DiffT NumBlocks = End - Start;
     assert(NumBlocks > 0 && "Cannot fragment empty function");
@@ -170,7 +402,8 @@
   bool keepEmpty() override { return false; }
 
-  void fragment(const BlockIt Start, const BlockIt End) override {
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
     const DiffT NumBlocks = End - Start;
     assert(NumBlocks > 0 && "Cannot fragment empty function");
@@ -221,7 +454,8 @@
     return true;
   }
 
-  void fragment(const BlockIt Start, const BlockIt End) override {
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
     unsigned Fragment = 0;
     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
       BB->setFragmentNum(FragmentNum(Fragment++));
@@ -244,11 +478,15 @@
   const size_t MainSize = FragmentSizes.lookup(FragmentNum::main());
   const size_t F1Size = FragmentSizes.lookup(FragmentNum(1));
+  const size_t F2Size = FragmentSizes.lookup(FragmentNum(2));
   Stats.emplace_back("hot", MainSize);
   if (FragmentSizes.size() == 2) {
     Stats.emplace_back("cold", F1Size);
-  } else if (FragmentSizes.size() > 2) {
+  } else if (FragmentSizes.size() == 3) {
+    Stats.emplace_back("warm", F1Size);
+    Stats.emplace_back("cold", F2Size);
+  } else if (FragmentSizes.size() > 3) {
     const size_t NonHotSize = TotalSize - MainSize;
     Stats.emplace_back("non-hot", NonHotSize);
   }
@@ -285,6 +523,9 @@
   case SplitFunctionsStrategy::Profile2:
     Strategy = std::make_unique<SplitProfile2>();
     break;
+  case SplitFunctionsStrategy::Profile3:
+    Strategy = std::make_unique<SplitProfile3>(BC);
+    break;
   case SplitFunctionsStrategy::Random2:
     Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
@@ -385,7 +626,7 @@
   }
 
   BF.getLayout().updateLayoutIndices();
-  S.fragment(NewLayout.begin(), NewLayout.end());
+  S.fragment(BF, NewLayout.begin(), NewLayout.end());
 
   // Make sure all non-outlineable blocks are in the main-fragment.
   for (BinaryBasicBlock *const BB : NewLayout) {
diff --git a/bolt/test/X86/split-profile3-chain.s b/bolt/test/X86/split-profile3-chain.s
new file mode 100644
--- /dev/null
+++ b/bolt/test/X86/split-profile3-chain.s
@@ -0,0 +1,101 @@
+# Test that a chain of outlineable blocks is not split
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=profile3 \
+# RUN:   --split-size-cost=0.2 --print-split --print-only=chain \
+# RUN:   --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:   2>&1 | FileCheck --check-prefix=NOSPLIT %s
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=profile3 \
+# RUN:   --split-size-cost=0.3 --print-split --print-only=chain \
+# RUN:   --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:   2>&1 | FileCheck --check-prefix=SPLIT %s
+
+# NOSPLIT: Binary Function "chain"
+# NOSPLIT: IsSplit :
+# NOSPLIT-SAME: {{ 0$}}
+
+# SPLIT: Binary Function "chain"
+# SPLIT: IsSplit :
+# SPLIT-SAME: {{ 1$}}
+# SPLIT: {{^\.LBB00}}
+# SPLIT: {{^\.LFT0}}
+# SPLIT: {{^\.Ltmp1}}
+# SPLIT: ------- HOT-COLD SPLIT POINT -------
+
+        .text
+        .globl chain
+        .type chain, @function
+chain:
+        pushq %rbp
+        movq %rsp, %rbp
+        cmpl $2, %edi
+LLentry_LLchain_start:
+        jge LLchain_start
+# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10
+# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 50
+LLfast:
+        movl $5, %eax
+LLfast_LLexit:
+        jmp LLexit
+# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 50
+LLchain_start:
+        movl $10, %eax
+LLchain_start_LLchain1:
+        jmp LLchain1
+# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10
+LLchain1:
+        addl $1, %eax
+LLchain1_LLchain2:
+        jmp LLchain2
+# FDATA: 1 chain #LLchain1_LLchain2# 1 chain #LLchain2# 0 10
+LLchain2:
+        addl $1, %eax
+LLchain2_LLchain3:
+        jmp LLchain3
+# FDATA: 1 chain #LLchain2_LLchain3# 1 chain #LLchain3# 0 10
+LLchain3:
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+LLchain3_LLchain4:
+        jmp LLchain4
+# FDATA: 1 chain #LLchain3_LLchain4# 1 chain #LLchain4# 0 10
+LLchain4:
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+LLchain4_LLexit:
+        jmp LLexit
+# FDATA: 1 chain #LLchain4_LLexit# 1 chain #LLexit# 0 10
+LLexit:
+        popq %rbp
+        ret
+LLchain_end:
+        .size chain, LLchain_end-chain
+
+
+        .globl main
+        .type main, @function
+main:
+        pushq %rbp
+        movq %rsp, %rbp
+        movl $1, %edi
+LLmain_chain1:
+        call chain
+# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 50
+        movl $4, %edi
+LLmain_chain2:
+        call chain
+# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10
+        xorl %eax, %eax
+        popq %rbp
+        retq
+.Lmain_end:
+        .size main, .Lmain_end-main
diff --git a/bolt/test/X86/split-profile3-empty-warm.s b/bolt/test/X86/split-profile3-empty-warm.s
new file mode 100644
--- /dev/null
+++ b/bolt/test/X86/split-profile3-empty-warm.s
@@ -0,0 +1,57 @@
+# Test that 3-way splitting does not remove the warm fragment when it is empty
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=profile3 \
+# RUN:   --split-size-cost=0.1 --print-split --print-only=foo \
+# RUN:   --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:   2>&1 | FileCheck %s
+
+# CHECK: Binary Function "foo"
+# CHECK: IsSplit :
+# CHECK-SAME: {{ 1$}}
+# CHECK: ------- HOT-COLD SPLIT POINT -------
+# CHECK: ------- HOT-COLD SPLIT POINT -------
+
+        .text
+        .globl foo
+        .type foo, @function
+foo:
+        pushq %rbp
+        movq %rsp, %rbp
+        cmpl $2, %edi
+LLentry_LLcold:
+        jge LLcold
+# FDATA: 1 foo #LLentry_LLcold# 1 foo #LLhot# 0 50
+LLhot:
+        movl $5, %eax
+LLhot_LLexit:
+        jmp LLexit
+# FDATA: 1 foo #LLhot_LLexit# 1 foo #LLexit# 0 50
+LLcold:
+        movl $10, %eax
+        jmp LLexit
+LLexit:
+        popq %rbp
+        ret
+LLfoo_end:
+        .size foo, LLfoo_end-foo
+
+
+        .globl main
+        .type main, @function
+main:
+        pushq %rbp
+        movq %rsp, %rbp
+        movl $1, %edi
+LLmain_foo:
+        call foo
+# FDATA: 1 main #LLmain_foo# 1 foo 0 0 50
+        movl $4, %edi
+        xorl %eax, %eax
+        popq %rbp
+        retq
+.Lmain_end:
+        .size main, .Lmain_end-main
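
As a quick, self-contained illustration of the cost model documented in the SplitProfile3 comment, the following standalone program runs an incremental prefix search in the spirit of the one implemented above. It is not BOLT code: `Block`, `findBestSplit`, and all of the numbers are made up for illustration. Block 0 plays the role of the non-outlineable entry, every other block is outlineable, only forward edges contribute to the cost (as in the pass), and the median-based scaling of the size cost, the warm-call penalty, and the cold-block assignment are all omitted.

// Minimize SizeCost * (size of the hot prefix) + (taken count of edges that
// cross from the hot prefix into the remaining, warm blocks) over every
// candidate split point.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

struct Block {
  uint64_t Size = 0;
  // Forward edges only: (successor layout index, taken count).
  std::vector<std::pair<size_t, uint64_t>> Succs;
};

// Returns the number of leading blocks that should stay hot. Block 0 is
// treated as non-outlineable (always hot), like a function entry point.
static size_t findBestSplit(const std::vector<Block> &Blocks, double SizeCost) {
  // Fixed cost of the pinned entry block: its size cost plus all of its
  // forward edges, which initially lead into the warm fragment.
  double Current = SizeCost * static_cast<double>(Blocks[0].Size);
  for (const auto &Edge : Blocks[0].Succs)
    Current += static_cast<double>(Edge.second);
  double Best = Current;
  size_t BestSplit = 1;

  for (size_t I = 1; I < Blocks.size(); ++I) {
    // Moving block I into the hot prefix adds its size cost and the cost of
    // its forward out-edges (they now cross from hot into warm)...
    Current += SizeCost * static_cast<double>(Blocks[I].Size);
    for (const auto &Edge : Blocks[I].Succs)
      Current += static_cast<double>(Edge.second);
    // ...and removes the cost of forward edges from already-hot blocks into
    // block I, because those edges no longer cross the split.
    for (size_t P = 0; P < I; ++P)
      for (const auto &Edge : Blocks[P].Succs)
        if (Edge.first == I)
          Current -= static_cast<double>(Edge.second);

    // Like the pass, prefer the larger hot prefix on ties.
    if (Current <= Best) {
      Best = Current;
      BestSplit = I + 1;
    }
  }
  return BestSplit;
}

int main() {
  // A layout loosely modeled on the split-profile3-chain.s test after
  // reordering: entry, hot body, and exit first, then a lukewarm chain at the
  // end whose backward edge to the exit is ignored, as in the pass.
  std::vector<Block> Blocks = {
      {8, {{1, 50}, {3, 10}}}, // entry: usually falls through to the hot body
      {4, {{2, 50}}},          // hot body
      {4, {}},                 // exit
      {30, {{4, 10}}},         // lukewarm chain, first half
      {30, {}},                // lukewarm chain, second half
  };

  for (double SizeCost : {0.05, 0.2})
    std::cout << "size cost " << SizeCost << " -> hot prefix of "
              << findBestSplit(Blocks, SizeCost) << " block(s)\n";
  return 0;
}

With these invented numbers, a size cost of 0.05 keeps all five blocks hot, while 0.2 leaves only the first three blocks hot and lets the chain become warm, which loosely mirrors the threshold behavior exercised by split-profile3-chain.s.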