diff --git a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
--- a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
+++ b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
@@ -13,6 +13,7 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
 #include "llvm/Transforms/IPO/SampleContextTracker.h"
 #include <queue>
 #include <set>
@@ -40,40 +41,43 @@
 class ProfiledCallGraph {
 public:
   using iterator = std::set<ProfiledCallGraphNode *>::iterator;
-  ProfiledCallGraph(StringMap<FunctionSamples> &ProfileMap,
-                    SampleContextTracker &ContextTracker) {
-    // Add all profiled functions into profiled call graph.
-    // We only add function with actual context profile
-    for (auto &FuncSample : ProfileMap) {
-      FunctionSamples *FSamples = &FuncSample.second;
-      addProfiledFunction(FSamples->getName());
+
+  // Constructor for non-CS profile.
+  ProfiledCallGraph(StringMap<FunctionSamples> &ProfileMap) {
+    assert(!FunctionSamples::ProfileIsCS && "CS profile is not handled here");
+    for (const auto &Samples : ProfileMap) {
+      addProfiledCalls(Samples.second);
     }
+  }
 
-    // BFS traverse the context profile trie to add call edges for
-    // both samples calls as well as calls shown in context.
+  // Constructor for CS profile.
+  ProfiledCallGraph(SampleContextTracker &ContextTracker,
+                    bool AddNodeWithSamplesOnly = false) {
+    // BFS traverse the context profile trie to add call edges for calls shown
+    // in context.
     std::queue<ContextTrieNode *> Queue;
-    Queue.push(&ContextTracker.getRootContext());
+    for (auto &Child : ContextTracker.getRootContext().getAllChildContext()) {
+      ContextTrieNode *Callee = &Child.second;
+      addProfiledFunction(Callee->getFuncName());
+      Queue.push(Callee);
+    }
+
     while (!Queue.empty()) {
       ContextTrieNode *Caller = Queue.front();
       Queue.pop();
-      FunctionSamples *CallerSamples = Caller->getFunctionSamples();
-
-      // Add calls for context, if both caller and callee has context profile.
+      // Add calls for context. When AddNodeWithSamplesOnly is true, both caller
+      // and callee need to have context profile.
+      // Callsite target samples are not considered. A call target that doesn't
+      // come with a profile or is not on a call path to its child profile can
+      // be ignored, since processing it before its caller (if this is the only
+      // context) should not cause a problem.
       for (auto &Child : Caller->getAllChildContext()) {
         ContextTrieNode *Callee = &Child.second;
+        addProfiledFunction(Callee->getFuncName());
         Queue.push(Callee);
-        if (CallerSamples && Callee->getFunctionSamples()) {
+        if (!AddNodeWithSamplesOnly ||
+            (Caller->getFunctionSamples() && Callee->getFunctionSamples()))
           addProfiledCall(Caller->getFuncName(), Callee->getFuncName());
-        }
-      }
-
-      // Add calls from call site samples
-      if (CallerSamples) {
-        for (auto &LocCallSite : CallerSamples->getBodySamples()) {
-          for (auto &NameCallSite : LocCallSite.second.getCallTargets()) {
-            addProfiledCall(Caller->getFuncName(), NameCallSite.first());
-          }
-        }
       }
     }
   }
@@ -89,6 +93,7 @@
       ProfiledFunctions[Name] = ProfiledCallGraphNode(Name);
     }
   }
+
   void addProfiledCall(StringRef CallerName, StringRef CalleeName) {
     assert(ProfiledFunctions.count(CallerName));
     auto CalleeIt = ProfiledFunctions.find(CalleeName);
@@ -98,6 +103,25 @@
     ProfiledFunctions[CallerName].Callees.insert(&CalleeIt->second);
   }
 
+  void addProfiledCalls(const FunctionSamples &Samples) {
+    addProfiledFunction(Samples.getFuncName());
+
+    for (const auto &Sample : Samples.getBodySamples()) {
+      for (const auto &Target : Sample.second.getCallTargets()) {
+        addProfiledFunction(Target.first());
+        addProfiledCall(Samples.getFuncName(), Target.first());
+      }
+    }
+
+    for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
+      for (const auto &InlinedSamples : CallsiteSamples.second) {
+        addProfiledFunction(InlinedSamples.first);
+        addProfiledCall(Samples.getFuncName(), InlinedSamples.first);
+        addProfiledCalls(InlinedSamples.second);
+      }
+    }
+  }
+
 private:
   ProfiledCallGraphNode Root;
   StringMap<ProfiledCallGraphNode> ProfiledFunctions;
diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
--- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
@@ -18,7 +18,6 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/ProfileData/SampleProf.h"
@@ -124,7 +123,6 @@
   ContextTrieNode &getRootContext();
   void promoteMergeContextSamplesTree(const Instruction &Inst,
                                       StringRef CalleeName);
-  void addCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
   // Dump the internal context profile trie.
   void dump();
 
diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
--- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -568,26 +568,4 @@
 
   return *ToNode;
 }
-
-// Replace call graph edges with dynamic call edges from the profile.
-void SampleContextTracker::addCallGraphEdges(CallGraph &CG,
-                                             StringMap<Function *> &SymbolMap) {
-  // Add profile call edges to the call graph.
-  std::queue<ContextTrieNode *> NodeQueue;
-  NodeQueue.push(&RootContext);
-  while (!NodeQueue.empty()) {
-    ContextTrieNode *Node = NodeQueue.front();
-    NodeQueue.pop();
-    Function *F = SymbolMap.lookup(Node->getFuncName());
-    for (auto &I : Node->getAllChildContext()) {
-      ContextTrieNode *ChildNode = &I.second;
-      NodeQueue.push(ChildNode);
-      if (F && !F->isDeclaration()) {
-        Function *Callee = SymbolMap.lookup(ChildNode->getFuncName());
-        if (Callee && !Callee->isDeclaration())
-          CG[F]->addCalledFunction(nullptr, CG[Callee]);
-      }
-    }
-  }
-}
 } // namespace llvm
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -77,6 +77,7 @@
 #include "llvm/Support/GenericDomTree.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/ProfiledCallGraph.h"
 #include "llvm/Transforms/IPO/SampleContextTracker.h"
 #include "llvm/Transforms/IPO/SampleProfileProbe.h"
 #include "llvm/Transforms/Instrumentation.h"
@@ -160,15 +161,11 @@
              "order of call graph during sample profile loading. It only "
              "works for new pass manager. "));
 
-static cl::opt<bool> UseProfileIndirectCallEdges(
-    "use-profile-indirect-call-edges", cl::init(true), cl::Hidden,
-    cl::desc("Considering indirect call samples from profile when top-down "
-             "processing functions. Only CSSPGO is supported."));
-
-static cl::opt<bool> UseProfileTopDownOrder(
-    "use-profile-top-down-order", cl::init(false), cl::Hidden,
-    cl::desc("Process functions in one SCC in a top-down order "
-             "based on the input profile."));
+static cl::opt<bool>
+    UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
+                         cl::desc("Process functions in a top-down order "
+                                  "defined by the profiled call graph when "
+                                  "-sample-profile-top-down-load is on."));
 
 static cl::opt<bool> ProfileSizeInline(
     "sample-profile-inline-size", cl::Hidden, cl::init(false),
@@ -395,8 +392,7 @@
       const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
       bool Hot);
   std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
-  void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples);
-  void replaceCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
+  std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(CallGraph &CG);
   void generateMDProfMetadata(Function &F);
 
   /// Map from function name to Function *. Used to find the function from
@@ -1579,43 +1575,25 @@
 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
                     "Sample Profile loader", false, false)
 
-// Add inlined profile call edges to the call graph.
-void SampleProfileLoader::addCallGraphEdges(CallGraph &CG,
-                                            const FunctionSamples &Samples) {
-  Function *Caller = SymbolMap.lookup(Samples.getFuncName());
-  if (!Caller || Caller->isDeclaration())
-    return;
-
-  // Skip non-inlined call edges which are not important since top down inlining
-  // for non-CS profile is to get more precise profile matching, not to enable
-  // more inlining.
-
-  for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
-    for (const auto &InlinedSamples : CallsiteSamples.second) {
-      Function *Callee = SymbolMap.lookup(InlinedSamples.first);
-      if (Callee && !Callee->isDeclaration())
-        CG[Caller]->addCalledFunction(nullptr, CG[Callee]);
-      addCallGraphEdges(CG, InlinedSamples.second);
-    }
+std::unique_ptr<ProfiledCallGraph>
+SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) {
+  std::unique_ptr<ProfiledCallGraph> ProfiledCG;
+  if (ProfileIsCS)
+    ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
+  else
+    ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
+
+  // Add all functions into the profiled call graph even if they are not in
+  // the profile. This makes sure functions missing from the profile still
+  // gets a chance to be processed.
+  for (auto &Node : CG) {
+    const auto *F = Node.first;
+    if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile"))
+      continue;
+    ProfiledCG->addProfiledFunction(FunctionSamples::getCanonicalFnName(*F));
   }
-}
 
-// Replace call graph edges with dynamic call edges from the profile.
-void SampleProfileLoader::replaceCallGraphEdges(
-    CallGraph &CG, StringMap<Function *> &SymbolMap) {
-  // Remove static call edges from the call graph except for the ones from the
-  // root which make the call graph connected.
-  for (const auto &Node : CG)
-    if (Node.second.get() != CG.getExternalCallingNode())
-      Node.second->removeAllCalledFunctions();
-
-  // Add profile call edges to the call graph.
-  if (ProfileIsCS) {
-    ContextTracker->addCallGraphEdges(CG, SymbolMap);
-  } else {
-    for (const auto &Samples : Reader->getProfiles())
-      addCallGraphEdges(CG, Samples.second);
-  }
+  return ProfiledCG;
 }
 
 std::vector<Function *>
@@ -1641,89 +1619,78 @@
 
   assert(&CG->getModule() == &M);
 
-  // Add indirect call edges from profile to augment the static call graph.
-  // Functions will be processed in a top-down order defined by the static call
-  // graph. Adjusting the order by considering indirect call edges from the
-  // profile (which don't exist in the static call graph) can enable the
-  // inlining of indirect call targets by processing the caller before them.
-  // TODO: enable this for non-CS profile and fix the counts returning logic to
-  // have a full support for indirect calls.
-  if (UseProfileIndirectCallEdges && ProfileIsCS) {
-    for (auto &Entry : *CG) {
-      const auto *F = Entry.first;
-      if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile"))
-        continue;
-      auto &AllContexts = ContextTracker->getAllContextSamplesFor(F->getName());
-      if (AllContexts.empty())
-        continue;
-
-      for (const auto &BB : *F) {
-        for (const auto &I : BB.getInstList()) {
-          const auto *CB = dyn_cast<CallBase>(&I);
-          if (!CB || !CB->isIndirectCall())
-            continue;
-          const DebugLoc &DLoc = I.getDebugLoc();
-          if (!DLoc)
-            continue;
-          auto CallSite = FunctionSamples::getCallSiteIdentifier(DLoc);
-          for (FunctionSamples *Samples : AllContexts) {
-            if (auto CallTargets = Samples->findCallTargetMapAt(CallSite)) {
-              for (const auto &Target : CallTargets.get()) {
-                Function *Callee = SymbolMap.lookup(Target.first());
-                if (Callee && !Callee->isDeclaration())
-                  Entry.second->addCalledFunction(nullptr, (*CG)[Callee]);
-              }
-            }
-          }
-        }
+  if (UseProfiledCallGraph ||
+      (ProfileIsCS && !UseProfiledCallGraph.getNumOccurrences())) {
+    // Use profiled call edges to augment the top-down order. There are cases
+    // that the top-down order computed based on the static call graph doesn't
+    // reflect real execution order. For example
+    //
+    // 1. Incomplete static call graph due to unknown indirect call targets.
+    //    Adjusting the order by considering indirect call edges from the
+    //    profile can enable the inlining of indirect call targets by allowing
+    //    the caller processed before them.
+    // 2. Mutual call edges in an SCC. The static processing order computed for
+    //    an SCC may not reflect the call contexts in the context-sensitive
+    //    profile, thus may cause potential inlining to be overlooked. The
+    //    function order in one SCC is being adjusted to a top-down order based
+    //    on the profile to favor more inlining. This is only a problem with CS
+    //    profile.
+    // 3. Transitive indirect call edges due to inlining. When a callee function
+    //    (say B) is inlined into into a caller function (say A) in LTO prelink,
+    //    every call edge originated from the callee B will be transferred to
+    //    the caller A. If any transferred edge (say A->C) is indirect, the
+    //    original profiled indirect edge B->C, even if considered, would not
+    //    enforce a top-down order from the caller A to the potential indirect
+    //    call target C in LTO postlink since the inlined callee B is gone from
+    //    the static call graph.
+    // 4. #3 can happen even for direct call targets, due to functions defined
+    //    in header files. A header function (say A), when included into source
+    //    files, is defined multiple times but only one definition survives due
+    //    to ODR. Therefore, the LTO prelink inlining done on those dropped
+    //    definitions can be useless based on a local file scope. More
+    //    importantly, the inlinee (say B), once fully inlined to a
+    //    to-be-dropped A, will have no profile to consume when its outlined
+    //    version is compiled. This can lead to a profile-less prelink
+    //    compilation for the outlined version of B which may be called from
+    //    external modules. while this isn't easy to fix, we rely on the
+    //    postlink AutoFDO pipeline to optimize B. Since the survived copy of
+    //    the A can be inlined in its local scope in prelink, it may not exist
+    //    in the merged IR in postlink, and we'll need the profiled call edges
+    //    to enforce a top-down order for the rest of the functions.
+    //
+    // Considering those cases, a profiled call graph completely independent of
+    // the static call graph is constructed based on profile data, where
+    // function objects are not even needed to handle case #3 and case 4.
+    //
+    // Note that static callgraph edges are completely ignored since they
+    // can be conflicting with profiled edges for cyclic SCCs and may result in
+    // an SCC order incompatible with profile-defined one. Using strictly
+    // profile order ensures a maximum inlining experience. On the other hand,
+    // static call edges are not so important when they don't correspond to a
+    // context in the profile.
+
+    std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG);
+    scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
+    while (!CGI.isAtEnd()) {
+      for (ProfiledCallGraphNode *Node : *CGI) {
+        Function *F = SymbolMap.lookup(Node->Name);
+        if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
+          FunctionOrderList.push_back(F);
       }
+      ++CGI;
     }
-  }
-
-  // Compute a top-down order the profile which is used to sort functions in
-  // one SCC later. The static processing order computed for an SCC may not
-  // reflect the call contexts in the context-sensitive profile, thus may cause
-  // potential inlining to be overlooked. The function order in one SCC is being
-  // adjusted to a top-down order based on the profile to favor more inlining.
-  DenseMap<Function *, uint64_t> ProfileOrderMap;
-  if (UseProfileTopDownOrder ||
-      (ProfileIsCS && !UseProfileTopDownOrder.getNumOccurrences())) {
-    // Create a static call graph. The call edges are not important since they
-    // will be replaced by dynamic edges from the profile.
-    CallGraph ProfileCG(M);
-    replaceCallGraphEdges(ProfileCG, SymbolMap);
-    scc_iterator<CallGraph *> CGI = scc_begin(&ProfileCG);
-    uint64_t I = 0;
+  } else {
+    scc_iterator<CallGraph *> CGI = scc_begin(CG);
     while (!CGI.isAtEnd()) {
       for (CallGraphNode *Node : *CGI) {
-        if (auto *F = Node->getFunction())
-          ProfileOrderMap[F] = ++I;
+        auto *F = Node->getFunction();
+        if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
+          FunctionOrderList.push_back(F);
       }
       ++CGI;
     }
   }
 
-  scc_iterator<CallGraph *> CGI = scc_begin(CG);
-  while (!CGI.isAtEnd()) {
-    uint64_t Start = FunctionOrderList.size();
-    for (CallGraphNode *Node : *CGI) {
-      auto *F = Node->getFunction();
-      if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
-        FunctionOrderList.push_back(F);
-    }
-
-    // Sort nodes in SCC based on the profile top-down order.
-    if (!ProfileOrderMap.empty()) {
-      std::stable_sort(FunctionOrderList.begin() + Start,
-                       FunctionOrderList.end(),
-                       [&ProfileOrderMap](Function *Left, Function *Right) {
-                         return ProfileOrderMap[Left] < ProfileOrderMap[Right];
-                       });
-    }
-
-    ++CGI;
-  }
-
   LLVM_DEBUG({
     dbgs() << "Function processing order:\n";
     for (auto F : reverse(FunctionOrderList)) {
diff --git a/llvm/test/Transforms/SampleProfile/ctxsplit.ll b/llvm/test/Transforms/SampleProfile/ctxsplit.ll
--- a/llvm/test/Transforms/SampleProfile/ctxsplit.ll
+++ b/llvm/test/Transforms/SampleProfile/ctxsplit.ll
@@ -1,14 +1,14 @@
 ; Check the nonflattened part of the ctxsplit profile will be read in thinlto
 ; postlink phase while flattened part of the ctxsplit profile will not be read.
-; RUN: opt < %s -passes='thinlto<O2>' -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=POSTLINK
+; RUN: opt < %s -passes='thinlto<O2>' -pgo-kind=pgo-sample-use-pipeline -use-profiled-call-graph=0 -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=POSTLINK
 ;
 ; Check both the flattened and nonflattened parts of the ctxsplit profile will
 ; be read in thinlto prelink phase.
-; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=PRELINK
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -use-profiled-call-graph=0 -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=PRELINK
 ;
 ; Check both the flattened and nonflattened parts of the ctxsplit profile will
 ; be read in non-thinlto mode.
-; RUN: opt < %s -passes='default<O2>' -pgo-kind=pgo-sample-use-pipeline -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=NOTHINLTO
+; RUN: opt < %s -passes='default<O2>' -pgo-kind=pgo-sample-use-pipeline -use-profiled-call-graph=0 -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=NOTHINLTO
 
 ; POSTLINK: define dso_local i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
 ; POSTLINK: define dso_local i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
diff --git a/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll
--- a/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll
+++ b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll
@@ -1,14 +1,14 @@
 ; Test we lose details of not inlined profile without '-sample-profile-merge-inlinee'
-; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -enable-new-pm=0 -S | FileCheck -check-prefix=SCALE %s
-; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -enable-new-pm=0 -S | FileCheck -check-prefix=SCALE %s
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -S | FileCheck -check-prefix=SCALE %s
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -use-profiled-call-graph=0 -enable-new-pm=0 -S | FileCheck -check-prefix=SCALE %s
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -use-profiled-call-graph=0 -enable-new-pm=0 -S | FileCheck -check-prefix=SCALE %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -use-profiled-call-graph=0 -S | FileCheck -check-prefix=SCALE %s
 
 ; Test we properly merge not inlined profile with '-sample-profile-merge-inlinee'
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -S | FileCheck -check-prefix=MERGE  %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -use-profiled-call-graph=0 -S | FileCheck -check-prefix=MERGE  %s
 
 ; Test we properly merge not inlined profile with '-sample-profile-merge-inlinee'
 ; when the profile uses md5.
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.md5.prof -sample-profile-merge-inlinee=true -S | FileCheck -check-prefix=MERGE  %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.md5.prof -sample-profile-merge-inlinee=true -use-profiled-call-graph=0 -S | FileCheck -check-prefix=MERGE  %s
 
 @.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
 
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-order.ll b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
--- a/llvm/test/Transforms/SampleProfile/profile-context-order.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
@@ -1,21 +1,21 @@
 ;; Test for different function processing orders affecting inlining in sample profile loader.
 
 ;; There is an SCC _Z5funcAi -> _Z8funcLeafi -> _Z5funcAi in the program.
-;; With -use-profile-top-down-order=0, the top-down processing order of
+;; With -use-profiled-call-graph=0, the top-down processing order of
 ;; that SCC is (_Z8funcLeafi, _Z5funcAi), which is determinined based on
-;; the static call graph. With -use-profile-top-down-order=1, call edges
+;; the static call graph. With -use-profiled-call-graph=1, call edges
 ;; from profile are considered, thus the order becomes (_Z5funcAi, _Z8funcLeafi)
 ;; which leads to _Z8funcLeafi inlined into _Z5funcAi.
-; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=INLINE
-; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=NOINLINE
+; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=INLINE
+; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=NOINLINE
 
 ;; There is an indirect call _Z5funcAi -> _Z3fibi in the program.
-;; With -use-profile-indirect-call-edges=0, the processing order computed
+;; With -use-profiled-call-graph=0, the processing order computed
 ;; based on the static call graph is (_Z3fibi, _Z5funcAi). With 
-;; -use-profile-top-down-order=1, the indirect call edge from profile is
+;; -use-profiled-call-graph=1, the indirect call edge from profile is
 ;; considered, thus the order becomes (_Z5funcAi, _Z3fibi) which leads to
 ;; _Z3fibi inlined into _Z5funcAi.
-; RUN: opt < %s -passes=sample-profile -use-profile-indirect-call-edges=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-INLINE
+; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-INLINE
 
 @factor = dso_local global i32 3, align 4, !dbg !0
 @fp = dso_local global i32 (i32)* null, align 8
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
@@ -25,8 +25,6 @@
 ; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
 ; INLINE-ALL-NEXT: Getting callee context for instr:   %call.i1 = tail call i32 @_Z3fibi
 ; INLINE-ALL-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi
-; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcAi
-; INLINE-ALL-NEXT:   Merging context profile into base profile: _Z5funcAi
 ; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcBi
 ; INLINE-ALL-NEXT:   Merging context profile into base profile: _Z5funcBi
 ; INLINE-ALL-NEXT:   Found context tree root to promote: external:10 @ _Z5funcBi
@@ -41,6 +39,8 @@
 ; INLINE-ALL-NEXT: Marking context profile as inlined: _Z5funcBi:1 @ _Z8funcLeafi
 ; INLINE-ALL-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi
 ; INLINE-ALL-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi
+; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcAi
+; INLINE-ALL-NEXT:   Merging context profile into base profile: _Z5funcAi
 ; INLINE-ALL-NEXT: Getting base profile for function: _Z8funcLeafi
 ; INLINE-ALL-NEXT:   Merging context profile into base profile: _Z8funcLeafi
 
@@ -55,16 +55,6 @@
 ; INLINE-HOT-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
 ; INLINE-HOT-NEXT: Getting callee context for instr:   %call1 = tail call i32 @_Z5funcAi
 ; INLINE-HOT-NEXT:   Callee context found: main:3 @ _Z5funcAi
-; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcAi
-; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z5funcAi
-; INLINE-HOT-NEXT:   Found context tree root to promote: main:3 @ _Z5funcAi
-; INLINE-HOT-NEXT:   Context promoted to: _Z5funcAi
-; INLINE-HOT-NEXT:   Context promoted to: _Z5funcAi:1 @ _Z8funcLeafi
-; INLINE-HOT-NEXT:   Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50
-; INLINE-HOT-NEXT:   Callee context found: _Z5funcAi:1 @ _Z8funcLeafi
-; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcAi:1 @ _Z8funcLeafi
-; INLINE-HOT-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62
-; INLINE-HOT-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi(i32 %tmp1.i) #2, !dbg !69
 ; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcBi
 ; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z5funcBi
 ; INLINE-HOT-NEXT:   Found context tree root to promote: external:10 @ _Z5funcBi
@@ -79,6 +69,16 @@
 ; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcBi:1 @ _Z8funcLeafi
 ; INLINE-HOT-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi
 ; INLINE-HOT-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi
+; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcAi
+; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z5funcAi
+; INLINE-HOT-NEXT:   Found context tree root to promote: main:3 @ _Z5funcAi
+; INLINE-HOT-NEXT:   Context promoted to: _Z5funcAi
+; INLINE-HOT-NEXT:   Context promoted to: _Z5funcAi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT:   Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50
+; INLINE-HOT-NEXT:   Callee context found: _Z5funcAi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcAi:1 @ _Z8funcLeafi
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi(i32 %tmp1.i) #2, !dbg !69
 ; INLINE-HOT-NEXT: Getting base profile for function: _Z8funcLeafi
 ; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z8funcLeafi
 
diff --git a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
--- a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
@@ -1,14 +1,14 @@
 ;; Test for different function processing orders affecting inlining in sample profile loader.
 
 ;; There is an SCC _Z5funcAi -> _Z8funcLeafi -> _Z5funcAi in the program.
-;; With -use-profile-top-down-order=0, the top-down processing order of
+;; With -use-profiled-call-graph=0, the top-down processing order of
 ;; that SCC is (_Z8funcLeafi, _Z5funcAi), which is determinined based on
-;; the static call graph. With -use-profile-top-down-order=1, call edges
+;; the static call graph. With -use-profiled-call-graph=1, call edges
 ;; from profile are considered, thus the order becomes (_Z5funcAi, _Z8funcLeafi).
 ;; While _Z8funcLeafi is not supposed to be inlined, the outlined entry counts
 ;; are affected.
-; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=0 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=STATIC
-; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=1 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=DYNAMIC
+; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=0 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=STATIC
+; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=DYNAMIC
 
 
 ; STATIC:  define dso_local i32 @_Z8funcLeafi{{.*}} !prof ![[#PROF:]]
diff --git a/llvm/tools/llvm-profgen/CSPreInliner.cpp b/llvm/tools/llvm-profgen/CSPreInliner.cpp
--- a/llvm/tools/llvm-profgen/CSPreInliner.cpp
+++ b/llvm/tools/llvm-profgen/CSPreInliner.cpp
@@ -42,7 +42,7 @@
 
 std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
   std::vector<StringRef> Order;
-  ProfiledCallGraph ProfiledCG(ProfileMap, ContextTracker);
+  ProfiledCallGraph ProfiledCG(ContextTracker, true);
 
   // Now that we have a profiled call graph, construct top-down order
   // by building up SCC and reversing SCC order.