diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -242,6 +242,10 @@
            (LineOffset == O.LineOffset && Discriminator < O.Discriminator);
   }
 
+  bool operator==(const LineLocation &O) const {
+    return LineOffset == O.LineOffset && Discriminator == O.Discriminator;
+  }
+
   uint32_t LineOffset;
   uint32_t Discriminator;
 };
@@ -339,6 +343,129 @@
 
 raw_ostream &operator<<(raw_ostream &OS, const SampleRecord &Sample);
 
+// State of context associated with FunctionSamples
+enum ContextStateMask {
+  UnknownContext = 0x0,   // Profile without context
+  RawContext = 0x1,       // Full context profile from input profile
+  SyntheticContext = 0x2, // Synthetic context created for context promotion
+  InlinedContext = 0x4,   // Profile for context that is inlined into caller
+  MergedContext = 0x8     // Profile for context merged into base profile
+};
+
+// Sample context for FunctionSamples. It consists of the calling context,
+// the function name and context state. Internally sample context is represented
+// using StringRef, which is also the input for constructing a `SampleContext`.
+// It can accept and represent both full context string as well as context-less
+// function name.
+// Example of full context string (note the wrapping `[]`):
+//    `[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]`
+// Example of context-less function name (same as AutoFDO):
+//    `_Z8funcLeafi`
+class SampleContext {
+public:
+  SampleContext() : State(UnknownContext) {}
+  SampleContext(StringRef ContextStr,
+                ContextStateMask CState = UnknownContext) {
+    setContext(ContextStr, CState);
+  }
+
+  // Promote context by removing top frames (represented by `ContextStrToRemove`).
+  // Note that with string representation of context, the promotion is effectively
+  // a substr operation with `ContextStrToRemove` removed from left.
+  void promoteOnPath(StringRef ContextStrToRemove) {
+    assert(FullContext.startswith(ContextStrToRemove));
+
+    // Remove leading context and frame separator " @ ".
+    FullContext = FullContext.substr(ContextStrToRemove.size() + 3);
+    CallingContext = CallingContext.substr(ContextStrToRemove.size() + 3);
+  }
+
+  // Split the top context frame (left-most substr) from context.
+  static std::pair<StringRef, StringRef>
+  splitContextString(StringRef ContextStr) {
+    return ContextStr.split(" @ ");
+  }
+
+  // Decode context string for a frame to get function name and location.
+  // `ContextStr` is in the form of `FuncName[:LineOffset[.Discriminator]]`.
+  static void decodeContextString(StringRef ContextStr, StringRef &FName,
+                                  LineLocation &LineLoc) {
+    // Get function name
+    auto EntrySplit = ContextStr.split(':');
+    FName = EntrySplit.first;
+
+    LineLoc = {0, 0};
+    if (!EntrySplit.second.empty()) {
+      // Get line offset, use signed int for getAsInteger so string will
+      // be parsed as signed.
+      int LineOffset = 0;
+      auto LocSplit = EntrySplit.second.split('.');
+      LocSplit.first.getAsInteger(10, LineOffset);
+      LineLoc.LineOffset = LineOffset;
+
+      // Get discriminator
+      if (!LocSplit.second.empty())
+        LocSplit.second.getAsInteger(10, LineLoc.Discriminator);
+    }
+  }
+
+  operator StringRef() const { return FullContext; }
+  bool hasState(ContextStateMask S) const { return State & (uint32_t)S; }
+  void setState(ContextStateMask S) { State |= (uint32_t)S; }
+  void clearState(ContextStateMask S) { State &= (uint32_t)~S; }
+  bool hasContext() const { return State != UnknownContext; }
+  bool isBaseContext() const { return CallingContext.empty(); }
+  StringRef getName() const { return Name; }
+  StringRef getCallingContext() const { return CallingContext; }
+  StringRef getNameWithContext() const { return FullContext; }
+
+private:
+  // Given a context string, decode and populate internal states like
+  // function name, calling context and context state. Example of input
+  // `ContextStr`: `[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]`
+  void setContext(StringRef ContextStr, ContextStateMask CState) {
+    assert(!ContextStr.empty());
+    // Note that `[]` wrapped input indicates a full context string, otherwise
+    // it's treated as context-less function name only.
+    bool HasContext = ContextStr.startswith("[");
+    if (!HasContext && CState == UnknownContext) {
+      State = UnknownContext;
+      Name = FullContext = ContextStr;
+    } else {
+      // Assume raw context profile if unspecified
+      if (CState == UnknownContext)
+        State = RawContext;
+      else
+        State = CState;
+
+      // Remove encapsulating '[' and ']' if any
+      if (HasContext)
+        FullContext = ContextStr.substr(1, ContextStr.size() - 2);
+      else
+        FullContext = ContextStr;
+
+      // Caller is to the left of callee in context string
+      auto NameContext = FullContext.rsplit(" @ ");
+      if (NameContext.second.empty()) {
+        Name = NameContext.first;
+        CallingContext = NameContext.second;
+      } else {
+        Name = NameContext.second;
+        CallingContext = NameContext.first;
+      }
+    }
+  }
+
+  // Full context string including calling context and leaf function name
+  StringRef FullContext;
+  // Function name for the associated sample profile
+  StringRef Name;
+  // Calling context (leaf function excluded) for the associated sample profile
+  StringRef CallingContext;
+  // State of the associated sample profile
+  uint32_t State;
+};
+
 class FunctionSamples;
 class SampleProfileReaderItaniumRemapper;
 
@@ -396,10 +523,16 @@
   ErrorOr<uint64_t> findSamplesAt(uint32_t LineOffset,
                                   uint32_t Discriminator) const {
     const auto &ret = BodySamples.find(LineLocation(LineOffset, Discriminator));
-    if (ret == BodySamples.end())
+    if (ret == BodySamples.end()) {
+      // For CSSPGO, in order to conserve profile size, we no longer write out
+      // locations profile for those not hit during training, so we need to
+      // treat them as zero instead of error here.
+      if (ProfileIsCS)
+        return 0;
       return std::error_code();
-    else
+    } else {
       return ret->second.getSamples();
+    }
   }
 
   /// Returns the call target map collected at a given location.
@@ -615,6 +748,12 @@
       const DILocation *DIL,
       SampleProfileReaderItaniumRemapper *Remapper = nullptr) const;
 
+  static bool ProfileIsCS;
+
+  SampleContext &getContext() const { return Context; }
+
+  void setContext(const SampleContext &FContext) { Context = FContext; }
+
   static SampleProfileFormat Format;
 
   /// Whether the profile uses MD5 to represent string.
@@ -639,6 +778,9 @@
   /// Mangled name of the function.
   StringRef Name;
 
+  /// Calling context for function profile
+  mutable SampleContext Context;
+
   /// Total number of samples collected inside this function.
   ///
   /// Samples are cumulative, they include all the samples collected
diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -419,6 +419,9 @@
   /// \brief Return the profile format.
   SampleProfileFormat getFormat() const { return Format; }
 
+  /// Whether input profile is fully context-sensitive
+  bool profileIsCS() const { return ProfileIsCS; }
+
   virtual std::unique_ptr<ProfileSymbolList> getProfileSymbolList() {
     return nullptr;
   };
@@ -461,6 +464,8 @@
 
   std::unique_ptr<SampleProfileReaderItaniumRemapper> Remapper;
 
+  bool ProfileIsCS = false;
+
   /// \brief The format of sample.
   SampleProfileFormat Format = SPF_None;
 };
diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
@@ -0,0 +1,141 @@
+//===- Transforms/IPO/SampleContextTracker.h --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file provides the interface for context-sensitive profile tracker used
+/// by CSSPGO.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_SAMPLECONTEXTTRACKER_H
+#define LLVM_TRANSFORMS_IPO_SAMPLECONTEXTTRACKER_H
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include <list>
+#include <map>
+
+using namespace llvm;
+using namespace sampleprof;
+
+namespace llvm {
+
+// Internal trie tree representation used for tracking context tree and sample
+// profiles. The path from root node to a given node represents the context of
+// that node's profile.
+class ContextTrieNode {
+public:
+  ContextTrieNode(ContextTrieNode *Parent = nullptr,
+                  StringRef FName = StringRef(),
+                  FunctionSamples *FSamples = nullptr,
+                  LineLocation CallLoc = {0, 0})
+      : ParentContext(Parent), FuncName(FName), FuncSamples(FSamples),
+        CallSiteLoc(CallLoc){};
+  ContextTrieNode *getChildContext(const LineLocation &CallSite,
+                                   StringRef CalleeName);
+  ContextTrieNode *getChildContext(const LineLocation &CallSite);
+  ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite,
+                                           StringRef CalleeName,
+                                           bool AllowCreate = true);
+
+  ContextTrieNode &moveToChildContext(const LineLocation &CallSite,
+                                      ContextTrieNode &&NodeToMove,
+                                      StringRef ContextStrToRemove,
+                                      bool DeleteNode = true);
+  void removeChildContext(const LineLocation &CallSite, StringRef CalleeName);
+  std::map<uint32_t, ContextTrieNode> &getAllChildContext();
+  const StringRef getFuncName() const;
+  FunctionSamples *getFunctionSamples() const;
+  void setFunctionSamples(FunctionSamples *FSamples);
+  LineLocation getCallSiteLoc() const;
+  ContextTrieNode *getParentContext() const;
+  void setParentContext(ContextTrieNode *Parent);
+  void dump();
+
+private:
+  static uint32_t nodeHash(StringRef ChildName, const LineLocation &Callsite);
+
+  // Map hash of (callee name, call-site location) to child context
+  std::map<uint32_t, ContextTrieNode> AllChildContext;
+
+  // Link to parent context node
+  ContextTrieNode *ParentContext;
+
+  // Function name for current context
+  StringRef FuncName;
+
+  // Function Samples for current context
+  FunctionSamples *FuncSamples;
+
+  // Callsite location in parent context
+  LineLocation CallSiteLoc;
+};
+
+// Profile tracker that manages profiles and its associated context. It
+// provides interfaces used by sample profile loader to query context profile or
+// base profile for given function or location; it also manages context tree
+// manipulation that is needed to accommodate inline decisions so we have
+// accurate post-inline profile for functions. Internally context profiles
+// are organized in a trie, with each node representing profile for specific
+// calling context and the context is identified by path from root to the node.
+class SampleContextTracker {
+public:
+  SampleContextTracker(StringMap<FunctionSamples> &Profiles);
+  // Query context profile for a specific callee with given name at a given
+  // call-site. The full context is identified by location of call instruction.
+  FunctionSamples *getCalleeContextSamplesFor(const CallBase &Inst,
+                                              StringRef CalleeName);
+  // Query context profile for a given location. The full context
+  // is identified by input DILocation.
+  FunctionSamples *getContextSamplesFor(const DILocation *DIL);
+  // Query context profile for a given sample context of a function.
+  FunctionSamples *getContextSamplesFor(const SampleContext &Context);
+  // Query base profile for a given function. A base profile is a merged view
+  // of all context profiles for contexts that are not inlined.
+  FunctionSamples *getBaseSamplesFor(const Function &Func,
+                                     bool MergeContext = true);
+  // Query base profile for a given function by name.
+  FunctionSamples *getBaseSamplesFor(StringRef Name, bool MergeContext);
+  // Mark a context profile as inlined when function is inlined.
+  // This makes sure that inlined context profile will be excluded in
+  // function's base profile.
+  void markContextSamplesInlined(const FunctionSamples *InlinedSamples);
+  // Dump the internal context profile trie.
+  void dump();
+
+private:
+  ContextTrieNode *getContextFor(const DILocation *DIL);
+  ContextTrieNode *getContextFor(const SampleContext &Context);
+  ContextTrieNode *getCalleeContextFor(const DILocation *DIL,
+                                       StringRef CalleeName);
+  ContextTrieNode *getOrCreateContextPath(const SampleContext &Context,
+                                          bool AllowCreate);
+  ContextTrieNode *getTopLevelContextNode(StringRef FName);
+  ContextTrieNode &addTopLevelContextNode(StringRef FName);
+  ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &NodeToPromo);
+  void promoteMergeContextSamplesTree(const Instruction &Inst,
+                                      StringRef CalleeName);
+  void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode,
+                        StringRef ContextStrToRemove);
+  ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &FromNode,
+                                                  ContextTrieNode &ToNodeParent,
+                                                  StringRef ContextStrToRemove);
+
+  // Map from function name to context profiles (excluding base profile)
+  StringMap<SmallSet<FunctionSamples *, 16>> FuncToCtxtProfileSet;
+
+  // Root node for context trie tree
+  ContextTrieNode RootContext;
+};
+
+} // end namespace llvm
+#endif // LLVM_TRANSFORMS_IPO_SAMPLECONTEXTTRACKER_H
diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp
--- a/llvm/lib/ProfileData/SampleProf.cpp
+++ b/llvm/lib/ProfileData/SampleProf.cpp
@@ -31,6 +31,7 @@
 namespace llvm {
 namespace sampleprof {
 SampleProfileFormat FunctionSamples::Format;
+bool FunctionSamples::ProfileIsCS = false;
 bool FunctionSamples::UseMD5;
 } // namespace sampleprof
 } // namespace llvm
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -196,6 +196,8 @@
   sampleprof_error Result = sampleprof_error::success;
 
   InlineCallStack InlineStack;
+  int CSProfileCount = 0;
+  int RegularProfileCount = 0;
 
   for (; !LineIt.is_at_eof(); ++LineIt) {
     if ((*LineIt)[(*LineIt).find_first_not_of(' ')] == '#')
@@ -220,9 +222,15 @@
                     "Expected 'mangled_name:NUM:NUM', found " + *LineIt);
         return sampleprof_error::malformed;
       }
-      Profiles[FName] = FunctionSamples();
-      FunctionSamples &FProfile = Profiles[FName];
-      FProfile.setName(FName);
+      SampleContext FContext(FName);
+      if (FContext.hasContext())
+        ++CSProfileCount;
+      else
+        ++RegularProfileCount;
+      Profiles[FContext] = FunctionSamples();
+      FunctionSamples &FProfile = Profiles[FContext];
+      FProfile.setName(FContext.getName());
+      FProfile.setContext(FContext);
       MergeResult(Result, FProfile.addTotalSamples(NumSamples));
       MergeResult(Result, FProfile.addHeadSamples(NumHeadSamples));
       InlineStack.clear();
@@ -264,6 +272,11 @@
       }
     }
   }
+
+  assert((RegularProfileCount == 0 || CSProfileCount == 0) &&
+         "Cannot have both context-sensitive and regular profile");
+  ProfileIsCS = (CSProfileCount > 0);
+
   if (Result == sampleprof_error::success)
     computeSummary();
 
@@ -1292,6 +1305,8 @@
     return;
   }
 
+  // CSSPGO-TODO: Remapper is not yet supported.
+  // We will need to remap the entire context string.
   assert(Remappings && "should be initialized while creating remapper");
   for (auto &Sample : Reader.getProfiles()) {
     DenseSet<StringRef> NamesInSample;
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -31,6 +31,7 @@
   PartialInlining.cpp
   PassManagerBuilder.cpp
   PruneEH.cpp
+  SampleContextTracker.cpp
   SampleProfile.cpp
   SampleProfileProbe.cpp
   SCCP.cpp
diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -0,0 +1,521 @@
+//===- SampleContextTracker.cpp - Context-sensitive Profile Tracker -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SampleContextTracker used by CSSPGO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SampleContextTracker.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include <map>
+#include <queue>
+#include <vector>
+
+using namespace llvm;
+using namespace sampleprof;
+
+#define DEBUG_TYPE "sample-context-tracker"
+
+namespace llvm {
+
+ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite,
+                                                  StringRef CalleeName) {
+  if (CalleeName.empty())
+    return getChildContext(CallSite);
+
+  uint32_t Hash = nodeHash(CalleeName, CallSite);
+  auto It = AllChildContext.find(Hash);
+  if (It != AllChildContext.end())
+    return &It->second;
+  return nullptr;
+}
+
+ContextTrieNode *
+ContextTrieNode::getChildContext(const LineLocation &CallSite) {
+  // CSFDO-TODO: This could be slow, change AllChildContext so we can
+  // do point look up for child node by call site alone.
+  // CSFDO-TODO: Return the child with max count for indirect call
+  ContextTrieNode *ChildNodeRet = nullptr;
+  for (auto &It : AllChildContext) {
+    ContextTrieNode &ChildNode = It.second;
+    if (ChildNode.CallSiteLoc == CallSite) {
+      if (ChildNodeRet)
+        return nullptr;
+      else
+        ChildNodeRet = &ChildNode;
+    }
+  }
+
+  return ChildNodeRet;
+}
+
+ContextTrieNode &ContextTrieNode::moveToChildContext(
+    const LineLocation &CallSite, ContextTrieNode &&NodeToMove,
+    StringRef ContextStrToRemove, bool DeleteNode) {
+  uint32_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite);
+  assert(!AllChildContext.count(Hash) && "Node to add must not exist");
+  LineLocation OldCallSite = NodeToMove.CallSiteLoc;
+  ContextTrieNode &OldParentContext = *NodeToMove.getParentContext();
+  AllChildContext[Hash] = NodeToMove;
+  ContextTrieNode &NewNode = AllChildContext[Hash];
+  NewNode.CallSiteLoc = CallSite;
+
+  // Walk through nodes in the moved subtree, and update
+  // FunctionSamples' context to reflect the context promotion.
+  // We also need to set the new parent link for all children.
+  std::queue<ContextTrieNode *> NodeToUpdate;
+  NewNode.setParentContext(this);
+  NodeToUpdate.push(&NewNode);
+
+  while (!NodeToUpdate.empty()) {
+    ContextTrieNode *Node = NodeToUpdate.front();
+    NodeToUpdate.pop();
+    FunctionSamples *FSamples = Node->getFunctionSamples();
+
+    if (FSamples) {
+      FSamples->getContext().promoteOnPath(ContextStrToRemove);
+      FSamples->getContext().setState(SyntheticContext);
+      LLVM_DEBUG(dbgs() << "  Context promoted to: " << FSamples->getContext()
+                        << "\n");
+    }
+
+    for (auto &It : Node->getAllChildContext()) {
+      ContextTrieNode *ChildNode = &It.second;
+      ChildNode->setParentContext(Node);
+      NodeToUpdate.push(ChildNode);
+    }
+  }
+
+  // Original context no longer needed, destroy if requested.
+  if (DeleteNode)
+    OldParentContext.removeChildContext(OldCallSite, NewNode.getFuncName());
+
+  return NewNode;
+}
+
+void ContextTrieNode::removeChildContext(const LineLocation &CallSite,
+                                         StringRef CalleeName) {
+  uint32_t Hash = nodeHash(CalleeName, CallSite);
+  // Note this essentially calls dtor and destroys that child context
+  AllChildContext.erase(Hash);
+}
+
+std::map<uint32_t, ContextTrieNode> &ContextTrieNode::getAllChildContext() {
+  return AllChildContext;
+}
+
+const StringRef ContextTrieNode::getFuncName() const { return FuncName; }
+
+FunctionSamples *ContextTrieNode::getFunctionSamples() const {
+  return FuncSamples;
+}
+
+void ContextTrieNode::setFunctionSamples(FunctionSamples *FSamples) {
+  FuncSamples = FSamples;
+}
+
+LineLocation ContextTrieNode::getCallSiteLoc() const { return CallSiteLoc; }
+
+ContextTrieNode *ContextTrieNode::getParentContext() const {
+  return ParentContext;
+}
+
+void ContextTrieNode::setParentContext(ContextTrieNode *Parent) {
+  ParentContext = Parent;
+}
+
+void ContextTrieNode::dump() {
+  dbgs() << "Node: " << FuncName << "\n"
+         << "  Callsite: " << CallSiteLoc << "\n"
+         << "  Children:\n";
+
+  for (auto &It : AllChildContext) {
+    dbgs() << "    Node: " << It.second.getFuncName() << "\n";
+  }
+}
+
+uint32_t ContextTrieNode::nodeHash(StringRef ChildName,
+                                   const LineLocation &Callsite) {
+  // We still use child's name for child hash, this is
+  // because for children of root node, we don't have
+  // different line/discriminator, and we'll rely on name
+  // to differentiate children.
+  uint32_t NameHash = std::hash<std::string>{}(ChildName.str());
+  uint32_t LocId = (Callsite.LineOffset << 16) | Callsite.Discriminator;
+  return NameHash + (LocId << 5) + LocId;
+}
+
+ContextTrieNode *ContextTrieNode::getOrCreateChildContext(
+    const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate) {
+  uint32_t Hash = nodeHash(CalleeName, CallSite);
+  auto It = AllChildContext.find(Hash);
+  if (It != AllChildContext.end()) {
+    assert(It->second.getFuncName() == CalleeName &&
+           "Hash collision for child context node");
+    return &It->second;
+  }
+
+  if (!AllowCreate)
+    return nullptr;
+
+  AllChildContext[Hash] = ContextTrieNode(this, CalleeName, nullptr, CallSite);
+  return &AllChildContext[Hash];
+}
+
+// Profile tracker that manages profiles and their associated contexts
+SampleContextTracker::SampleContextTracker(
+    StringMap<FunctionSamples> &Profiles) {
+  for (auto &FuncSample : Profiles) {
+    FunctionSamples *FSamples = &FuncSample.second;
+    SampleContext Context(FuncSample.first(), RawContext);
+    LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context << "\n");
+    if (!Context.isBaseContext())
+      FuncToCtxtProfileSet[Context.getName()].insert(FSamples);
+    ContextTrieNode *NewNode = getOrCreateContextPath(Context, true);
+    assert(!NewNode->getFunctionSamples() &&
+           "New node can't have sample profile");
+    NewNode->setFunctionSamples(FSamples);
+  }
+}
+
+FunctionSamples *
+SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst,
+                                                 StringRef CalleeName) {
+  LLVM_DEBUG(dbgs() << "Getting callee context for instr: " << Inst << "\n");
+  // CSFDO-TODO: We use CalleeName to differentiate indirect call
+  // We need to get sample for indirect callee too.
+  DILocation *DIL = Inst.getDebugLoc();
+  if (!DIL)
+    return nullptr;
+
+  ContextTrieNode *CalleeContext = getCalleeContextFor(DIL, CalleeName);
+  if (CalleeContext) {
+    FunctionSamples *FSamples = CalleeContext->getFunctionSamples();
+    LLVM_DEBUG(if (FSamples) {
+      dbgs() << "  Callee context found: " << FSamples->getContext() << "\n";
+    });
+    return FSamples;
+  }
+
+  return nullptr;
+}
+
+FunctionSamples *
+SampleContextTracker::getContextSamplesFor(const DILocation *DIL) {
+  assert(DIL && "Expect non-null location");
+
+  ContextTrieNode *ContextNode = getContextFor(DIL);
+  if (!ContextNode)
+    return nullptr;
+
+  // We may have inlined callees during pre-LTO compilation, in which case
+  // we need to rely on the inline stack from !dbg to mark context profile
+  // as inlined, instead of `markContextSamplesInlined` during inlining.
+  // Sample profile loader walks through all instructions to get profile,
+  // which calls this function. So once that is done, all previously inlined
+  // context profiles should be marked properly.
+  FunctionSamples *Samples = ContextNode->getFunctionSamples();
+  if (Samples && ContextNode->getParentContext() != &RootContext)
+    Samples->getContext().setState(InlinedContext);
+
+  return Samples;
+}
+
+FunctionSamples *
+SampleContextTracker::getContextSamplesFor(const SampleContext &Context) {
+  ContextTrieNode *Node = getContextFor(Context);
+  if (!Node)
+    return nullptr;
+
+  return Node->getFunctionSamples();
+}
+
+FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func,
+                                                         bool MergeContext) {
+  StringRef CanonName = FunctionSamples::getCanonicalFnName(Func);
+  return getBaseSamplesFor(CanonName, MergeContext);
+}
+
+FunctionSamples *SampleContextTracker::getBaseSamplesFor(StringRef Name,
+                                                         bool MergeContext) {
+  LLVM_DEBUG(dbgs() << "Getting base profile for function: " << Name << "\n");
+  // Base profile is top-level node (child of root node), so try to retrieve
+  // existing top-level node for given function first. If it exists, it could be
+  // that we've merged base profile before, or there's actually context-less
+  // profile from the input (e.g. due to unreliable stack walking).
+  ContextTrieNode *Node = getTopLevelContextNode(Name);
+  if (MergeContext) {
+    LLVM_DEBUG(dbgs() << "  Merging context profile into base profile: " << Name
+                      << "\n");
+
+    // We have profile for function under different contexts,
+    // create synthetic base profile and merge context profiles
+    // into base profile.
+    for (auto *CSamples : FuncToCtxtProfileSet[Name]) {
+      SampleContext &Context = CSamples->getContext();
+      ContextTrieNode *FromNode = getContextFor(Context);
+      if (FromNode == Node)
+        continue;
+
+      // Skip inlined context profile and also don't re-merge any context
+      if (Context.hasState(InlinedContext) || Context.hasState(MergedContext))
+        continue;
+
+      ContextTrieNode &ToNode = promoteMergeContextSamplesTree(*FromNode);
+      assert((!Node || Node == &ToNode) && "Expect only one base profile");
+      Node = &ToNode;
+    }
+  }
+
+  // Still no profile even after merge/promotion (if allowed)
+  if (!Node)
+    return nullptr;
+
+  return Node->getFunctionSamples();
+}
+
+void SampleContextTracker::markContextSamplesInlined(
+    const FunctionSamples *InlinedSamples) {
+  assert(InlinedSamples && "Expect non-null inlined samples");
+  LLVM_DEBUG(dbgs() << "Marking context profile as inlined: "
+                    << InlinedSamples->getContext() << "\n");
+  InlinedSamples->getContext().setState(InlinedContext);
+}
+
+void SampleContextTracker::promoteMergeContextSamplesTree(
+    const Instruction &Inst, StringRef CalleeName) {
+  LLVM_DEBUG(dbgs() << "Promoting and merging context tree for instr: \n"
+                    << Inst << "\n");
+  // CSFDO-TODO: We also need to promote context profile from indirect
+  // calls. We won't have callee names from those from call instr.
+  if (CalleeName.empty())
+    return;
+
+  // Get the caller context for the call instruction, we don't use callee
+  // name from call because there can be context from indirect calls too.
+  DILocation *DIL = Inst.getDebugLoc();
+  ContextTrieNode *CallerNode = DIL ? getContextFor(DIL) : nullptr;
+  if (!CallerNode)
+    return;
+
+  // Get the context that needs to be promoted
+  LineLocation CallSite(FunctionSamples::getOffset(DIL),
+                        DIL->getBaseDiscriminator());
+  ContextTrieNode *NodeToPromo =
+      CallerNode->getChildContext(CallSite, CalleeName);
+  if (!NodeToPromo)
+    return;
+
+  promoteMergeContextSamplesTree(*NodeToPromo);
+}
+
+ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
+    ContextTrieNode &NodeToPromo) {
+  // Promote the input node to be directly under root. This can happen
+  // when we decided to not inline a function under context represented
+  // by the input node. The promote and merge is then needed to reflect
+  // the context profile in the base (context-less) profile.
+  FunctionSamples *FromSamples = NodeToPromo.getFunctionSamples();
+  assert(FromSamples && "Shouldn't promote a context without profile");
+  LLVM_DEBUG(dbgs() << "  Found context tree root to promote: "
+                    << FromSamples->getContext() << "\n");
+
+  StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext();
+  return promoteMergeContextSamplesTree(NodeToPromo, RootContext,
+                                        ContextStrToRemove);
+}
+
+void SampleContextTracker::dump() {
+  dbgs() << "Context Profile Tree:\n";
+  std::queue<ContextTrieNode *> NodeQueue;
+  NodeQueue.push(&RootContext);
+
+  while (!NodeQueue.empty()) {
+    ContextTrieNode *Node = NodeQueue.front();
+    NodeQueue.pop();
+    Node->dump();
+
+    for (auto &It : Node->getAllChildContext()) {
+      ContextTrieNode *ChildNode = &It.second;
+      NodeQueue.push(ChildNode);
+    }
+  }
+}
+
+ContextTrieNode *
+SampleContextTracker::getContextFor(const SampleContext &Context) {
+  return getOrCreateContextPath(Context, false);
+}
+
+ContextTrieNode *
+SampleContextTracker::getCalleeContextFor(const DILocation *DIL,
+                                          StringRef CalleeName) {
+  assert(DIL && "Expect non-null location");
+
+  // CSSPGO-TODO: need to support indirect callee
+  if (CalleeName.empty())
+    return nullptr;
+
+  ContextTrieNode *CallContext = getContextFor(DIL);
+  if (!CallContext)
+    return nullptr;
+
+  return CallContext->getChildContext(
+      LineLocation(FunctionSamples::getOffset(DIL),
+                   DIL->getBaseDiscriminator()),
+      CalleeName);
+}
+
+ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) {
+  assert(DIL && "Expect non-null location");
+  SmallVector<std::pair<LineLocation, StringRef>, 10> S;
+
+  // Use C++ linkage name if possible.
+  const DILocation *PrevDIL = DIL;
+  for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
+    StringRef Name = PrevDIL->getScope()->getSubprogram()->getLinkageName();
+    if (Name.empty())
+      Name = PrevDIL->getScope()->getSubprogram()->getName();
+    S.push_back(
+        std::make_pair(LineLocation(FunctionSamples::getOffset(DIL),
+                                    DIL->getBaseDiscriminator()), Name));
+    PrevDIL = DIL;
+  }
+
+  // Push root node. Note that a root node like main may only have
+  // a name, but no linkage name.
+  StringRef RootName = PrevDIL->getScope()->getSubprogram()->getLinkageName();
+  if (RootName.empty())
+    RootName = PrevDIL->getScope()->getSubprogram()->getName();
+  S.push_back(std::make_pair(LineLocation(0, 0), RootName));
+
+  ContextTrieNode *ContextNode = &RootContext;
+  int I = S.size();
+  while (--I >= 0 && ContextNode) {
+    LineLocation &CallSite = S[I].first;
+    StringRef &CalleeName = S[I].second;
+    ContextNode = ContextNode->getChildContext(CallSite, CalleeName);
+  }
+
+  if (I < 0)
+    return ContextNode;
+
+  return nullptr;
+}
+
+// Walk the trie along the frames of `Context`, starting at RootContext, and
+// return the node reached for the last frame. With `AllowCreate` set, missing
+// nodes are created on the way; otherwise the walk stops at the first missing
+// node and null is returned.
+ContextTrieNode *
+SampleContextTracker::getOrCreateContextPath(const SampleContext &Context,
+                                             bool AllowCreate) {
+  ContextTrieNode *Node = &RootContext;
+  StringRef Remaining = Context;
+  StringRef CalleeName;
+
+  // Location under which the next frame is attached to its parent. It lags
+  // one frame behind: the location decoded from a frame is only used when
+  // the following frame is looked up. The first frame uses (0, 0).
+  LineLocation ParentCallSite(0, 0);
+
+  while (Node && !Remaining.empty()) {
+    // Peel off the leading frame and keep the rest for later iterations.
+    auto Split = SampleContext::splitContextString(Remaining);
+    StringRef Frame = Split.first;
+    Remaining = Split.second;
+
+    LineLocation FrameCallSite(0, 0);
+    SampleContext::decodeContextString(Frame, CalleeName, FrameCallSite);
+
+    // Create child node at parent line/disc location
+    Node = AllowCreate
+               ? Node->getOrCreateChildContext(ParentCallSite, CalleeName)
+               : Node->getChildContext(ParentCallSite, CalleeName);
+    ParentCallSite = FrameCallSite;
+  }
+
+  assert((!AllowCreate || Node) && "Node must exist if creation is allowed");
+  return Node;
+}
+
+// Look up the child of RootContext named `FName`. Top-level nodes are keyed
+// with the call site location (0, 0).
+ContextTrieNode *SampleContextTracker::getTopLevelContextNode(StringRef FName) {
+  const LineLocation TopLevelCallSite(0, 0);
+  return RootContext.getChildContext(TopLevelCallSite, FName);
+}
+
+// Add a child named `FName` directly under RootContext, keyed with the call
+// site location (0, 0), and return it. The node must not exist yet; this is
+// checked by an assertion only.
+ContextTrieNode &SampleContextTracker::addTopLevelContextNode(StringRef FName) {
+  assert(!getTopLevelContextNode(FName) && "Node to add must not exist");
+  const LineLocation TopLevelCallSite(0, 0);
+  ContextTrieNode *NewNode =
+      RootContext.getOrCreateChildContext(TopLevelCallSite, FName);
+  return *NewNode;
+}
+
+// Fold the profile attached to `FromNode` into `ToNode`.
+//  - Both nodes have samples: merge the source into the destination, mark the
+//    destination SyntheticContext and the source MergedContext.
+//  - Only the source has samples: hand the samples object over to `ToNode`,
+//    mark it SyntheticContext and promote its context by
+//    `ContextStrToRemove`; `FromNode` is left without samples.
+//  - The source has no samples: nothing is changed.
+void SampleContextTracker::mergeContextNode(ContextTrieNode &FromNode,
+                                            ContextTrieNode &ToNode,
+                                            StringRef ContextStrToRemove) {
+  FunctionSamples *Src = FromNode.getFunctionSamples();
+  FunctionSamples *Dst = ToNode.getFunctionSamples();
+
+  // Without a source profile there is nothing to merge or transfer.
+  if (!Src)
+    return;
+
+  if (Dst) {
+    // Merge/duplicate the source samples into the destination samples.
+    Dst->merge(*Src);
+    Dst->getContext().setState(SyntheticContext);
+    Src->getContext().setState(MergedContext);
+    return;
+  }
+
+  // Destination is empty: transfer the samples object itself.
+  ToNode.setFunctionSamples(Src);
+  Src->getContext().setState(SyntheticContext);
+  Src->getContext().promoteOnPath(ContextStrToRemove);
+  FromNode.setFunctionSamples(nullptr);
+}
+
+// Promote the context subtree rooted at `FromNode` so that it lives under
+// `ToNodeParent`, with `ContextStrToRemove` passed along for the promoted
+// contexts. If `ToNodeParent` has no matching child yet, `FromNode` is moved
+// there; otherwise the samples of `FromNode` and, recursively, of all its
+// children are merged into the existing destination subtree. Returns the
+// destination node.
+//
+// When the destination parent is the trie root, the call site location is
+// normalized to (0, 0) and the source node is removed from its old parent.
+ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
+    ContextTrieNode &FromNode, ContextTrieNode &ToNodeParent,
+    StringRef ContextStrToRemove) {
+  assert(!ContextStrToRemove.empty() && "Context to remove can't be empty");
+
+  // Ignore call site location if destination is top level under root
+  LineLocation NewCallSiteLoc = LineLocation(0, 0);
+  LineLocation OldCallSiteLoc = FromNode.getCallSiteLoc();
+  ContextTrieNode &FromNodeParent = *FromNode.getParentContext();
+  ContextTrieNode *ToNode = nullptr;
+  bool MoveToRoot = (&ToNodeParent == &RootContext);
+  if (!MoveToRoot) {
+    NewCallSiteLoc = OldCallSiteLoc;
+  }
+
+  // Locate destination node, create/move if not existing
+  ToNode = ToNodeParent.getChildContext(NewCallSiteLoc, FromNode.getFuncName());
+  if (!ToNode) {
+    // Do not delete node to move from its parent here because
+    // caller is iterating over children of that parent node.
+    ToNode = &ToNodeParent.moveToChildContext(
+        NewCallSiteLoc, std::move(FromNode), ContextStrToRemove, false);
+  } else {
+    // Destination node exists, merge samples for the context tree
+    mergeContextNode(FromNode, *ToNode, ContextStrToRemove);
+
+    // ToNode still has no samples when neither node carried a profile
+    // (mergeContextNode changes nothing in that case), so the debug print
+    // must not dereference the samples pointer unconditionally.
+    LLVM_DEBUG({
+      if (FunctionSamples *Merged = ToNode->getFunctionSamples())
+        dbgs() << "  Context promoted and merged to: " << Merged->getContext()
+               << "\n";
+    });
+
+    // Recursively promote and merge children
+    for (auto &It : FromNode.getAllChildContext()) {
+      ContextTrieNode &FromChildNode = It.second;
+      promoteMergeContextSamplesTree(FromChildNode, *ToNode,
+                                     ContextStrToRemove);
+    }
+
+    // Remove children once they're all merged
+    FromNode.getAllChildContext().clear();
+  }
+
+  // For root of subtree, remove itself from old parent too. The name is read
+  // from ToNode because FromNode may have been moved from above.
+  if (MoveToRoot)
+    FromNodeParent.removeChildContext(OldCallSiteLoc, ToNode->getFuncName());
+
+  return *ToNode;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -76,6 +76,7 @@
 #include "llvm/Support/GenericDomTree.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/SampleContextTracker.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -424,6 +425,9 @@
   /// Profile reader object.
   std::unique_ptr<SampleProfileReader> Reader;
 
+  /// Profile tracker for different context.
+  std::unique_ptr<SampleContextTracker> ContextTracker;
+
   /// Samples collected for the body of this function.
   FunctionSamples *Samples = nullptr;
 
@@ -436,6 +440,9 @@
   /// Flag indicating whether the profile input loaded successfully.
   bool ProfileIsValid = false;
 
+  /// Flag indicating whether input profile is context-sensitive
+  bool ProfileIsCS = false;
+
   /// Flag indicating if the pass is invoked in ThinLTO compile phase.
   ///
   /// In this phase, in annotation, we should not promote indirect calls.
@@ -733,9 +740,10 @@
   // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
   // it means that the inlined callsite has no sample, thus the call
   // instruction should have 0 count.
-  if (auto *CB = dyn_cast<CallBase>(&Inst))
-    if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
-      return 0;
+  if (!ProfileIsCS)
+    if (const auto *CB = dyn_cast<CallBase>(&Inst))
+      if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
+        return 0;
 
   const DILocation *DIL = DLoc;
   uint32_t LineOffset = FunctionSamples::getOffset(DIL);
@@ -831,7 +839,10 @@
 
   StringRef CalleeName;
   if (Function *Callee = Inst.getCalledFunction())
-    CalleeName = Callee->getName();
+    CalleeName = FunctionSamples::getCanonicalFnName(*Callee);
+
+  if (ProfileIsCS)
+    return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
 
   const FunctionSamples *FS = findFunctionSamples(Inst);
   if (FS == nullptr)
@@ -901,8 +912,13 @@
     return Samples;
 
   auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
-  if (it.second)
-    it.first->second = Samples->findFunctionSamples(DIL, Reader->getRemapper());
+  if (it.second) {
+    if (ProfileIsCS)
+      it.first->second = ContextTracker->getContextSamplesFor(DIL);
+    else
+      it.first->second =
+          Samples->findFunctionSamples(DIL, Reader->getRemapper());
+  }
   return it.first->second;
 }
 
@@ -957,6 +973,12 @@
   InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
                                   GetAC, GetTLI);
 
+  if (Cost.isNever())
+    return false;
+
+  if (Cost.isAlways())
+    return true;
+
   return Cost.getCost() <= SampleColdCallSiteThreshold;
 }
 
@@ -1017,7 +1039,7 @@
             assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
                    "GUIDToFuncNameMap has to be populated");
             AllCandidates.push_back(CB);
-            if (FS->getEntrySamples() > 0)
+            if (FS->getEntrySamples() > 0 || ProfileIsCS)
               localNotInlinedCallSites.try_emplace(CB, FS);
             if (callsiteIsHot(FS, PSI))
               Hot = true;
@@ -1075,6 +1097,8 @@
             // If profile mismatches, we should not attempt to inline DI.
             if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
                 inlineCallInstruction(cast<CallBase>(DI))) {
+              if (ProfileIsCS)
+                ContextTracker->markContextSamplesInlined(FS);
               localNotInlinedCallSites.erase(I);
               LocalChanged = true;
               ++NumCSInlined;
@@ -1088,6 +1112,9 @@
       } else if (CalledFunction && CalledFunction->getSubprogram() &&
                  !CalledFunction->isDeclaration()) {
         if (inlineCallInstruction(*I)) {
+          if (ProfileIsCS)
+            ContextTracker->markContextSamplesInlined(
+                localNotInlinedCallSites[I]);
           localNotInlinedCallSites.erase(I);
           LocalChanged = true;
           ++NumCSInlined;
@@ -1875,6 +1902,16 @@
       ExternalInlineAdvisor.reset();
   }
 
+  // Apply tweaks if context-sensitive profile is available.
+  if (Reader->profileIsCS()) {
+    ProfileIsCS = true;
+    FunctionSamples::ProfileIsCS = true;
+
+    // Tracker for profiles under different context
+    ContextTracker =
+        std::make_unique<SampleContextTracker>(Reader->getProfiles());
+  }
+
   return true;
 }
 
@@ -1940,9 +1977,10 @@
   }
 
   // Account for cold calls not inlined....
-  for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
-       notInlinedCallInfo)
-    updateProfileCallee(pair.first, pair.second.entryCount);
+  if (!ProfileIsCS)
+    for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
+         notInlinedCallInfo)
+      updateProfileCallee(pair.first, pair.second.entryCount);
 
   return retval;
 }
@@ -1957,7 +1995,6 @@
 }
 
 bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
-
   DILocation2SampleMap.clear();
   // By default the entry count is initialized to -1, which will be treated
   // conservatively by getEntryCount as the same as unknown (None). This is
@@ -2010,7 +2047,12 @@
     OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
     ORE = OwnedORE.get();
   }
-  Samples = Reader->getSamplesFor(F);
+
+  if (ProfileIsCS)
+    Samples = ContextTracker->getBaseSamplesFor(F);
+  else
+    Samples = Reader->getSamplesFor(F);
+
   if (Samples && !Samples->empty())
     return emitAnnotations(F);
   return false;
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-context-tracker.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-tracker.prof
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-tracker.prof
@@ -0,0 +1,36 @@
+[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
+ 0: 6
+ 1: 6
+ 3: 287884
+ 4: 287864 _Z3fibi:315608
+ 15: 23
+[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20
+ 0: 15
+ 1: 15
+ 3: 74946
+ 4: 74941 _Z3fibi:82359
+ 10: 23324
+ 11: 23327 _Z3fibi:25228
+ 15: 11
+[main]:154:0
+ 2: 12
+ 3: 18 _Z5funcAi:11
+ 3.1: 18 _Z5funcBi:19
+[external:12 @ main]:154:12
+ 2: 12
+ 3: 10 _Z5funcAi:7
+ 3.1: 10 _Z5funcBi:11
+[main:3.1 @ _Z5funcBi]:120:19
+ 0: 19
+ 1: 19 _Z8funcLeafi:20
+ 3: 12
+[externalA:17 @ _Z5funcBi]:120:3
+ 0: 3
+ 1: 3
+[external:10 @ _Z5funcBi]:120:10
+ 0: 10
+ 1: 10
+[main:3 @ _Z5funcAi]:99:11
+ 0: 10
+ 1: 10 _Z8funcLeafi:11
+ 3: 24
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
@@ -0,0 +1,234 @@
+; REQUIRES: asserts
+; Test for CSSPGO's SampleContextTracker to make sure context profile tree is promoted and merged properly
+; based on inline decision, so post inline counts are accurate.
+
+; Note that we need new pass manager to enable top-down processing for sample profile loader
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT
+
+
+; Test that we inlined the following in top-down order and promoted the rest of the not-inlined context profiles into the base profile
+;   main:3 @ _Z5funcAi
+;   main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
+;   _Z5funcBi:1 @ _Z8funcLeafi
+; INLINE-ALL:      Getting base profile for function: main
+; INLINE-ALL-NEXT:   Merging context profile into base profile: main
+; INLINE-ALL-NEXT:   Found context tree root to promote: external:12 @ main
+; INLINE-ALL-NEXT:   Context promoted and merged to: main
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
+; INLINE-ALL-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call1 = tail call i32 @_Z5funcAi
+; INLINE-ALL-NEXT:   Callee context found: main:3 @ _Z5funcAi
+; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z5funcBi(
+; INLINE-ALL-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z8funcLeafi
+; INLINE-ALL-NEXT:   Callee context found: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
+; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
+; INLINE-ALL-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call.i1 = tail call i32 @_Z3fibi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi
+; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcAi
+; INLINE-ALL-NEXT:   Merging context profile into base profile: _Z5funcAi
+; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcBi
+; INLINE-ALL-NEXT:   Merging context profile into base profile: _Z5funcBi
+; INLINE-ALL-NEXT:   Found context tree root to promote: external:10 @ _Z5funcBi
+; INLINE-ALL-NEXT:   Context promoted to: _Z5funcBi
+; INLINE-ALL-NEXT:   Found context tree root to promote: main:3.1 @ _Z5funcBi
+; INLINE-ALL-NEXT:   Context promoted and merged to: _Z5funcBi
+; INLINE-ALL-NEXT:   Context promoted to: _Z5funcBi:1 @ _Z8funcLeafi
+; INLINE-ALL-NEXT:   Found context tree root to promote: externalA:17 @ _Z5funcBi
+; INLINE-ALL-NEXT:   Context promoted and merged to: _Z5funcBi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi
+; INLINE-ALL-NEXT:   Callee context found: _Z5funcBi:1 @ _Z8funcLeafi
+; INLINE-ALL-NEXT: Marking context profile as inlined: _Z5funcBi:1 @ _Z8funcLeafi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi
+; INLINE-ALL-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi
+; INLINE-ALL-NEXT: Getting base profile for function: _Z8funcLeafi
+; INLINE-ALL-NEXT:   Merging context profile into base profile: _Z8funcLeafi
+
+; Test that we inlined the following in top-down order and promoted the rest of the not-inlined context profiles into the base profile
+;   main:3 @ _Z5funcAi
+;   _Z5funcAi:1 @ _Z8funcLeafi
+;   _Z5funcBi:1 @ _Z8funcLeafi
+; INLINE-HOT:      Getting base profile for function: main
+; INLINE-HOT-NEXT:   Merging context profile into base profile: main
+; INLINE-HOT-NEXT:   Found context tree root to promote: external:12 @ main
+; INLINE-HOT-NEXT:   Context promoted and merged to: main
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !58
+; INLINE-HOT-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !63
+; INLINE-HOT-NEXT:   Callee context found: main:3 @ _Z5funcAi
+; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcAi
+; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z5funcAi
+; INLINE-HOT-NEXT:   Found context tree root to promote: main:3 @ _Z5funcAi
+; INLINE-HOT-NEXT:   Context promoted to: _Z5funcAi
+; INLINE-HOT-NEXT:   Context promoted to: _Z5funcAi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50
+; INLINE-HOT-NEXT:   Callee context found: _Z5funcAi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcAi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi(i32 %tmp1.i) #2, !dbg !69
+; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcBi
+; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z5funcBi
+; INLINE-HOT-NEXT:   Found context tree root to promote: external:10 @ _Z5funcBi
+; INLINE-HOT-NEXT:   Context promoted to: _Z5funcBi
+; INLINE-HOT-NEXT:   Found context tree root to promote: main:3.1 @ _Z5funcBi
+; INLINE-HOT-NEXT:   Context promoted and merged to: _Z5funcBi
+; INLINE-HOT-NEXT:   Context promoted to: _Z5funcBi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT:   Found context tree root to promote: externalA:17 @ _Z5funcBi
+; INLINE-HOT-NEXT:   Context promoted and merged to: _Z5funcBi
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !50
+; INLINE-HOT-NEXT:   Callee context found: _Z5funcBi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcBi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi(i32 %tmp1.i) #2, !dbg !69
+; INLINE-HOT-NEXT: Getting base profile for function: _Z8funcLeafi
+; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z8funcLeafi
+
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+
+; Test driver. The loop counts %x.011 down from 300000 and, on every
+; iteration, calls _Z5funcBi and then _Z5funcAi, adding both results to the
+; running sum that is returned once %x.011 reaches 0. Uses attribute group #0,
+; which includes noinline.
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+; funcA: adds 100000 to %x and returns _Z8funcLeafi of the result. Uses
+; attribute group #1, which has no noinline.
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
+entry:
+  %add = add nsw i32 %x, 100000, !dbg !44
+  %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
+  ret i32 %call, !dbg !46
+}
+
+; funcLeaf: for %x > 0, repeatedly subtracts _Z3fibi(@factor) until the value
+; is no longer positive; for %x < 0, repeatedly adds _Z3fibi(@factor) until
+; the value is no longer negative; for %x == 0 returns 0. @factor is loaded
+; with a volatile load on every iteration. Uses attribute group #1.
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+  %cmp = icmp sgt i32 %x, 0, !dbg !57
+  br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader:                            ; preds = %entry
+  %cmp313 = icmp slt i32 %x, 0, !dbg !60
+  br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+  %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+  %call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
+  %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+  %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4:                                      ; preds = %while.body4, %while.cond2.preheader
+  %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+  %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+  %call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
+  %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+  %cmp3 = icmp slt i32 %add, 0, !dbg !60
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end:                                           ; preds = %while.body4, %while.body, %while.cond2.preheader
+  %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+  ret i32 %x.addr.2, !dbg !76
+}
+
+; funcB: adds -100000 to %x and returns _Z8funcLeafi of the result. Uses
+; attribute group #0, which includes noinline.
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !51
+  %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+  ret i32 %call, !dbg !53
+}
+
+declare i32 @_Z3fibi(i32)
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 27, column: 22, scope: !40)
+!45 = !DILocation(line: 27, column: 11, scope: !40)
+!46 = !DILocation(line: 29, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
@@ -0,0 +1,197 @@
+; Test for CSSPGO's SampleContextTracker to make sure context profile tree is promoted and merged properly
+; based on inline decision, so post inline counts are accurate.
+
+; Note that we need new pass manager to enable top-down processing for sample profile loader
+; Test that we inlined the following in top-down order and that entry counts accurately reflect the post-inline base profile
+;   main:3 @ _Z5funcAi
+;   main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
+;   _Z5funcBi:1 @ _Z8funcLeafi
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+
+; Test that we inlined the following in top-down order and that entry counts accurately reflect the post-inline base profile
+;   main:3 @ _Z5funcAi
+;   _Z5funcAi:1 @ _Z8funcLeafi
+;   _Z5funcBi:1 @ _Z8funcLeafi
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT
+
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+; INLINE-ALL: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]]
+; INLINE-HOT: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]]
+entry:
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+; _Z5funcBi uses attribute group #0, which includes noinline, so its call must remain in both runs
+; INLINE-ALL: call i32 @_Z5funcBi
+; INLINE-HOT: call i32 @_Z5funcBi
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+; INLINE-ALL-NOT: call i32 @_Z5funcAi
+; INLINE-HOT: call i32 @_Z5funcAi
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
+; In the INLINE-ALL run _Z5funcAi is inlined into main, so the outlined remainder should have zero counts; the INLINE-HOT run keeps the call and expects entry count 12
+; INLINE-ALL: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]]
+; INLINE-HOT: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]]
+entry:
+  %add = add nsw i32 %x, 100000, !dbg !44
+; In the INLINE-ALL run _Z8funcLeafi is already inlined on main->_Z5funcAi->_Z8funcLeafi,
+; so it should not be inlined again on _Z5funcAi->_Z8funcLeafi based on the updated
+; (merged and promoted) context profile; the INLINE-HOT run expects it to be inlined here
+; INLINE-ALL: call i32 @_Z8funcLeafi
+; INLINE-HOT-NOT: call i32 @_Z8funcLeafi
+  %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
+  ret i32 %call, !dbg !46
+}
+
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+; main->_Z5funcAi->_Z8funcLeafi (INLINE-ALL run) or _Z5funcAi->_Z8funcLeafi (INLINE-HOT run) is inlined, and
+; _Z5funcBi->_Z8funcLeafi is inlined in both runs, so the outlined remainder should have an empty profile
+; INLINE-ALL: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]]
+; INLINE-HOT: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]]
+entry:
+  %cmp = icmp sgt i32 %x, 0, !dbg !57
+  br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader:                            ; preds = %entry
+  %cmp313 = icmp slt i32 %x, 0, !dbg !60
+  br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+  %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+  %call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
+  %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+  %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4:                                      ; preds = %while.body4, %while.cond2.preheader
+  %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+  %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+  %call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
+  %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+  %cmp3 = icmp slt i32 %add, 0, !dbg !60
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end:                                           ; preds = %while.body4, %while.body, %while.cond2.preheader
+  %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+  ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+; _Z5funcBi is marked noinline, so the outlined remainder has the promoted context profile
+; INLINE-ALL: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]]
+; INLINE-HOT: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]]
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !51
+  %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+; _Z5funcBi is not inlined into main, so the main->_Z5funcBi->_Z8funcLeafi context is promoted
+; and _Z8funcLeafi should be inlined here based on that promoted context profile
+; INLINE-ALL-NOT: call i32 @_Z8funcLeafi
+; INLINE-HOT-NOT: call i32 @_Z8funcLeafi
+  ret i32 %call, !dbg !53
+}
+
+; INLINE-ALL-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13}
+; INLINE-ALL-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 0}
+; INLINE-ALL-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0}
+; INLINE-ALL-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33}
+
+; INLINE-HOT-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13}
+; INLINE-HOT-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 12}
+; INLINE-HOT-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0}
+; INLINE-HOT-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33}
+
+declare i32 @_Z3fibi(i32)
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 27, column: 22, scope: !40)
+!45 = !DILocation(line: 27, column: 11, scope: !40)
+!46 = !DILocation(line: 29, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)