diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -208,6 +208,13 @@
   SecFlagHasAttribute = (1 << 1)
 };
 
+enum class SecFuncOffsetFlags : uint32_t {
+  SecFlagInvalid = 0,
+  // Store function offsets in an order of contexts. The order ensures that
+  // callee contexts of a given context laid out next to it.
+  SecFlagOrdered = (1 << 0),
+};
+
 // Verify section specific flag is used for the correct section.
 template <class SecFlagType>
 static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
@@ -228,6 +235,8 @@
     IsFlagLegal = std::is_same<SecFuncMetadataFlags, SecFlagType>();
     break;
   default:
+  case SecFuncOffsetTable:
+    IsFlagLegal = std::is_same<SecFuncOffsetFlags, SecFlagType>();
     break;
   }
   if (!IsFlagLegal)
diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -720,6 +720,11 @@
   /// The table mapping from function context to the offset of its
   /// FunctionSample towards file start.
   DenseMap<SampleContext, uint64_t> FuncOffsetTable;
+
+  /// Function offset mapping ordered by contexts.
+  std::unique_ptr<std::vector<std::pair<SampleContext, uint64_t>>>
+      OrderedFuncOffsets;
+
   /// The set containing the functions to use when compiling a module.
   DenseSet<StringRef> FuncsToUse;
 
@@ -746,6 +751,8 @@
   /// SecFlagFlat flag.
   bool SkipFlatProf = false;
 
+  bool FuncOffsetsOrdered = false;
+
 public:
   SampleProfileReaderExtBinaryBase(std::unique_ptr<MemoryBuffer> B,
                                    LLVMContext &C, SampleProfileFormat Format)
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -675,6 +675,7 @@
       return EC;
     break;
   case SecFuncOffsetTable:
+    FuncOffsetsOrdered = hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered);
     if (std::error_code EC = readFuncOffsetTable())
       return EC;
     break;
@@ -720,17 +721,27 @@
     return EC;
 
   FuncOffsetTable.reserve(*Size);
+
+  if (FuncOffsetsOrdered) {
+    OrderedFuncOffsets =
+        std::make_unique<std::vector<std::pair<SampleContext, uint64_t>>>();
+    OrderedFuncOffsets->reserve(*Size);
+  }
+
   for (uint32_t I = 0; I < *Size; ++I) {
-    auto FName(readSampleContextFromTable());
-    if (std::error_code EC = FName.getError())
+    auto FContext(readSampleContextFromTable());
+    if (std::error_code EC = FContext.getError())
       return EC;
 
     auto Offset = readNumber<uint64_t>();
     if (std::error_code EC = Offset.getError())
       return EC;
 
-    FuncOffsetTable[*FName] = *Offset;
+    FuncOffsetTable[*FContext] = *Offset;
+    if (FuncOffsetsOrdered)
+      OrderedFuncOffsets->emplace_back(*FContext, *Offset);
   }
+
   return sampleprof_error::success;
 }
 
@@ -760,47 +771,62 @@
     }
 
     if (ProfileIsCS) {
-      // Compute the ordered set of names, so we can
-      // get all context profiles under a subtree by
-      // iterating through the ordered names.
-      std::set<SampleContext> OrderedContexts;
-      for (auto Name : FuncOffsetTable) {
-        OrderedContexts.insert(Name.first);
-      }
-
       DenseSet<uint64_t> FuncGuidsToUse;
       if (useMD5()) {
         for (auto Name : FuncsToUse)
           FuncGuidsToUse.insert(Function::getGUID(Name));
       }
 
-      // For each function in current module, load all
-      // context profiles for the function.
-      for (auto NameOffset : FuncOffsetTable) {
-        SampleContext FContext = NameOffset.first;
-        auto FuncName = FContext.getName();
-        if (useMD5()) {
-          if (!FuncGuidsToUse.count(std::stoull(FuncName.data())))
-            continue;
-        } else {
-          if (!FuncsToUse.count(FuncName) &&
-              (!Remapper || !Remapper->exist(FuncName)))
-            continue;
+      // For each function in current module, load all context profiles for
+      // the function as well as its callee contexts which can help profile
+      // guided importing for ThinLTO. This can be achieved by walking through
+      // an ordered context container, where contexts are laid out as if they
+      // were walked in preorder of a context trie. While traversing the
+      // trie, a link to the highest common ancestor node is kept so that all of
+      // its decendants will be loaded.
+
+      const SampleContext *CommonContext = nullptr;
+      auto LoadProfile = [&](const SampleContext &FContext, uint64_t Offset) {
+        auto FName = FContext.getName();
+        // For function in the current module, keep its farthest ancestor
+        // context. This can be used to load itself and its child and
+        // sibling contexts.
+        if ((useMD5() && FuncGuidsToUse.count(std::stoull(FName.data()))) ||
+            (!useMD5() && (FuncsToUse.count(FName) ||
+                           (Remapper && Remapper->exist(FName))))) {
+          if (!CommonContext || !CommonContext->IsPrefixOf(FContext))
+            CommonContext = &FContext;
         }
 
-        // For each context profile we need, try to load
-        // all context profile in the subtree. This can
-        // help profile guided importing for ThinLTO.
-        auto It = OrderedContexts.find(FContext);
-        while (It != OrderedContexts.end() && FContext.IsPrefixOf(*It)) {
-          const uint8_t *FuncProfileAddr = Start + FuncOffsetTable[*It];
+        if (CommonContext == &FContext ||
+            (CommonContext && CommonContext->IsPrefixOf(FContext))) {
+          // Load profile for the current context which originated from
+          // the common ancestor.
+          const uint8_t *FuncProfileAddr = Start + Offset;
           assert(FuncProfileAddr < End && "out of LBRProfile section");
           if (std::error_code EC = readFuncProfile(FuncProfileAddr))
             return EC;
-          // Remove loaded context profile so we won't
-          // load it repeatedly.
-          It = OrderedContexts.erase(It);
+        } else {
+          // Reset the ancestor since the current context is out of the subtree
+          // rooted from the ancestor.
+          CommonContext = nullptr;
         }
+        return (std::error_code)sampleprof_error::success;
+      };
+
+      if (OrderedFuncOffsets.get()) {
+        for (auto &NameOffset : *OrderedFuncOffsets)
+          if (std::error_code EC =
+                  LoadProfile(NameOffset.first, NameOffset.second))
+            return EC;
+      }
+      else {
+        std::map<SampleContext, uint64_t> OrderedContexts(
+            FuncOffsetTable.begin(), FuncOffsetTable.end());
+        for (auto &NameOffset : OrderedContexts)
+          if (std::error_code EC =
+                  LoadProfile(NameOffset.first, NameOffset.second))
+            return EC;
       }
     } else {
       if (useMD5()) {
@@ -1216,6 +1242,10 @@
     if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
       Flags.append("fs-discriminator,");
     break;
+  case SecFuncOffsetTable:
+    if (hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered))
+      Flags.append("ordered,");
+    break;
   default:
     break;
   }
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -41,6 +41,11 @@
 using namespace llvm;
 using namespace sampleprof;
 
+static cl::opt<bool> SortFuncOffsetTable(
+    "sort-func-offsets", cl::Hidden, cl::init(true),
+    cl::desc("Sort funcion offset table in context order. Only works with CS "
+             "and extbinary profiles"));
+
 std::error_code
 SampleProfileWriter::writeFuncProfiles(const SampleProfileMap &ProfileMap) {
   std::vector<NameFunctionSamples> V;
@@ -165,11 +170,29 @@
   encodeULEB128(FuncOffsetTable.size(), OS);
 
   // Write out FuncOffsetTable.
-  for (auto Entry : FuncOffsetTable) {
-    if (std::error_code EC = writeContextIdx(Entry.first))
+  auto WriteItem = [&](const SampleContext &Context, uint64_t Offset) {
+    if (std::error_code EC = writeContextIdx(Context))
       return EC;
-    encodeULEB128(Entry.second, OS);
+    encodeULEB128(Offset, OS);
+    return (std::error_code)sampleprof_error::success;
+  };
+
+  if (FunctionSamples::ProfileIsCS && SortFuncOffsetTable) {
+    // Sort the contexts before writing them out.
+    std::map<SampleContext, uint64_t> OrderedFuncOffsetTable(
+        FuncOffsetTable.begin(), FuncOffsetTable.end());
+    for (const auto &Entry : OrderedFuncOffsetTable) {
+      if (std::error_code EC = WriteItem(Entry.first, Entry.second))
+        return EC;
+    }
+    addSectionFlag(SecFuncOffsetTable, SecFuncOffsetFlags::SecFlagOrdered);
+  } else {
+    for (const auto &Entry : FuncOffsetTable) {
+      if (std::error_code EC = WriteItem(Entry.first, Entry.second))
+        return EC;
+    }
   }
+
   FuncOffsetTable.clear();
   return sampleprof_error::success;
 }
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
--- a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
@@ -1,9 +1,16 @@
 ; Make sure Import GUID list for ThinLTO properly set for CSSPGO
 ; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -S | FileCheck %s
-; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/csspgo-import-list.prof -o %t.prof
+; RUN: llvm-profdata merge --sample --extbinary --sort-func-offsets=0 %S/Inputs/csspgo-import-list.prof -o %t.prof
 ; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.prof -S | FileCheck %s
-; RUN: llvm-profdata merge --sample --extbinary --use-md5 %S/Inputs/csspgo-import-list.prof -o %t.md5
+; RUN: llvm-profdata merge --sample --extbinary --sort-func-offsets=1 %S/Inputs/csspgo-import-list.prof -o %t.ordered.prof
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.ordered.prof -S | FileCheck %s
+; RUN: llvm-profdata show --sample -show-sec-info-only %t.ordered.prof | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: llvm-profdata merge --sample --extbinary --use-md5 --sort-func-offsets=0 %S/Inputs/csspgo-import-list.prof -o %t.md5
 ; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.md5 -S | FileCheck %s
+; RUN: llvm-profdata merge --sample --extbinary --use-md5 --sort-func-offsets=1 %S/Inputs/csspgo-import-list.prof -o %t.ordered.md5
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.ordered.md5 -S | FileCheck %s
+; RUN: llvm-profdata show --sample -show-sec-info-only %t.ordered.md5 | FileCheck %s --check-prefix=CHECK-ORDERED
+
 
 declare i32 @_Z5funcBi(i32 %x)
 declare i32 @_Z5funcAi(i32 %x)
@@ -32,6 +39,7 @@
 ; CHECK: distinct !DISubprogram(name: "main"
 ; CHECK: !{!"function_entry_count", i64 3, i64 446061515086924981, i64 3815895320998406042, i64 7102633082150537521, i64 -2862076748587597320}
 
+; CHECK-ORDERED: FuncOffsetTableSection {{.*}} {ordered}
 
 attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }