diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h
--- a/llvm/tools/llvm-profgen/PerfReader.h
+++ b/llvm/tools/llvm-profgen/PerfReader.h
@@ -10,6 +10,7 @@
 #define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H
 #include "ErrorHandling.h"
 #include "ProfiledBinary.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Regex.h"
 #include <fstream>
@@ -75,8 +76,61 @@
       : Source(S), Target(T), IsArtificial(I) {}
 };
 
+// Hash interface for generic data of type T
+// Data should implement a \fn getHashCode and a \fn isEqual
+// Currently getHashCode is non-virtual to avoid the overhead of calling vtable,
+// i.e we explicitly calculate hash of derived class, assign to base class's
+// HashCode. This also provides the flexibility for calculating the hash code
+// incrementally(like rolling hash) during frame stack unwinding since unwinding
+// only changes the leaf of frame stack. \fn isEqual is a virtual function,
+// which will have perf overhead. In the future, if we redesign a better hash
+// function, then we can just skip this or switch to a non-virtual function
+// (e.g. skip the comparison if the probability of hash conflicts is low)
+template <class T> class Hashable {
+public:
+  std::shared_ptr<T> Data;
+  Hashable(const std::shared_ptr<T> &D) : Data(D) {}
+
+  // Hash code generation
+  struct Hash {
+    uint64_t operator()(const Hashable<T> &Key) const {
+      // Don't make it virtual for getHashCode
+      return Key.Data->getHashCode();
+    }
+  };
+
+  // Hash compare
+  struct Equal {
+    bool operator()(const Hashable<T> &LHS, const Hashable<T> &RHS) const {
+      // Precisely compare the data, vtable will have overhead.
+      return LHS.Data->isEqual(RHS.Data.get());
+    }
+  };
+  T &getData() { return *Data; }
+
+  T *getPtr() const { return Data.get(); }
+};
+
+// Base class to extend for all types of perf sample
+struct PerfSample {
+  uint64_t HashCode = 0;
+
+  virtual ~PerfSample() = default;
+  uint64_t getHashCode() const { return HashCode; }
+  virtual bool isEqual(const PerfSample *K) const {
+    return HashCode == K->HashCode;
+  };
+
+  // Utilities for LLVM-style RTTI
+  enum PerfKind { PK_HybridSample };
+  const PerfKind Kind;
+  PerfKind getKind() const { return Kind; }
+  PerfSample(PerfKind K) : Kind(K){};
+};
+
 // The parsed hybrid sample including call stack and LBR stack.
-struct HybridSample {
+struct HybridSample : public PerfSample {
+
   // Profiled binary that current frame address belongs to
   ProfiledBinary *Binary;
   // Call stack recorded in FILO(leaf to root) order
@@ -84,12 +138,18 @@
   // LBR stack recorded in FIFO order
   SmallVector<LBREntry, 16> LBRStack;
 
+  HybridSample() : PerfSample(PK_HybridSample){};
+  static bool classof(const PerfSample *K) {
+    return K->getKind() == PK_HybridSample;
+  }
+
   // Used for sample aggregation
-  bool operator==(const HybridSample &Other) const {
-    if (Other.Binary != Binary)
+  bool isEqual(const PerfSample *K) const override {
+    const HybridSample *Other = dyn_cast<HybridSample>(K);
+    if (!Other || Other->Binary != Binary)
       return false;
-    const std::list<uint64_t> &OtherCallStack = Other.CallStack;
-    const SmallVector<LBREntry, 16> &OtherLBRStack = Other.LBRStack;
+    const std::list<uint64_t> &OtherCallStack = Other->CallStack;
+    const SmallVector<LBREntry, 16> &OtherLBRStack = Other->LBRStack;
 
     if (CallStack.size() != OtherCallStack.size() ||
         LBRStack.size() != OtherLBRStack.size())
@@ -108,8 +168,33 @@
     }
     return true;
   }
+
+  uint64_t hashCombine(uint64_t Hash, uint64_t Value) {
+    // Simple DJB2 hash
+    return ((Hash << 5) + Hash) + Value;
+  }
+
+  void genHashCode() {
+    uint64_t Hash = 5381;
+    Hash = hashCombine(Hash, reinterpret_cast<uint64_t>(Binary));
+    for (const auto &Value : CallStack) {
+      Hash = hashCombine(Hash, Value);
+    }
+    for (const auto &Entry : LBRStack) {
+      Hash = hashCombine(Hash, Entry.Source);
+      Hash = hashCombine(Hash, Entry.Target);
+    }
+    HashCode = Hash;
+  }
 };
 
+// After parsing the sample, we record the samples by aggregating them
+// into this counter. The key stores the sample data and the value is
+// the sample repeat times.
+using AggregationCounter =
+    std::unordered_map<Hashable<PerfSample>, uint64_t,
+                       Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>;
+
 // The state for the unwinder, it doesn't hold the data but only keep the
 // pointer/index of the data, While unwinding, the CallStack is changed
 // dynamicially and will be recorded as the context of the sample
@@ -124,10 +209,10 @@
   const SmallVector<LBREntry, 16> &LBRStack;
   // Used to iterate the address range
   InstructionPointer InstPtr;
-  UnwindState(const HybridSample &Sample)
-      : Binary(Sample.Binary), CallStack(Sample.CallStack),
-        LBRStack(Sample.LBRStack),
-        InstPtr(Sample.Binary, Sample.CallStack.front()) {}
+  UnwindState(const HybridSample *Sample)
+      : Binary(Sample->Binary), CallStack(Sample->CallStack),
+        LBRStack(Sample->LBRStack),
+        InstPtr(Sample->Binary, Sample->CallStack.front()) {}
 
   bool validateInitialState() {
     uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
@@ -160,56 +245,61 @@
   void advanceLBR() { LBRIndex++; }
 };
 
+// Base class for sample counter key with context
+struct ContextKey {
+  uint64_t HashCode = 0;
+  virtual ~ContextKey() = default;
+  uint64_t getHashCode() const { return HashCode; }
+  virtual bool isEqual(const ContextKey *K) const {
+    return HashCode == K->HashCode;
+  };
+
+  // Utilities for LLVM-style RTTI
+  enum ContextKind { CK_StringBased };
+  const ContextKind Kind;
+  ContextKind getKind() const { return Kind; }
+  ContextKey(ContextKind K) : Kind(K){};
+};
+
+// String based context id
+struct StringBasedCtxKey : public ContextKey {
+  std::string Context;
+  StringBasedCtxKey() : ContextKey(CK_StringBased){};
+  static bool classof(const ContextKey *K) {
+    return K->getKind() == CK_StringBased;
+  }
+
+  bool isEqual(const ContextKey *K) const override {
+    const StringBasedCtxKey *Other = dyn_cast<StringBasedCtxKey>(K);
+    // Guard against a null dyn_cast result for non-StringBased keys.
+    return Other && Context == Other->Context;
+  }
+
+  uint64_t genHashCode() { return HashCode = hash_value(Context); }
+};
+
 // The counter of branch samples for one function indexed by the branch,
 // which is represented as the source and target offset pair.
 using BranchSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
 // The counter of range samples for one function indexed by the range,
 // which is represented as the start and end offset pair.
 using RangeSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
-// Range sample counters indexed by the context string
-using ContextRangeCounter = std::unordered_map<std::string, RangeSample>;
-// Branch sample counters indexed by the context string
-using ContextBranchCounter = std::unordered_map<std::string, BranchSample>;
-
-// For Hybrid sample counters
-struct ContextSampleCounters {
-  ContextRangeCounter RangeCounter;
-  ContextBranchCounter BranchCounter;
-
-  void recordRangeCount(std::string &ContextId, uint64_t Start, uint64_t End,
-                        uint64_t Repeat) {
-    RangeCounter[ContextId][{Start, End}] += Repeat;
-  }
-  void recordBranchCount(std::string &ContextId, uint64_t Source,
-                         uint64_t Target, uint64_t Repeat) {
-    BranchCounter[ContextId][{Source, Target}] += Repeat;
-  }
-};
+// Wrapper for sample counters including range counter and branch counter
+struct SampleCounter {
+  RangeSample RangeCounter;
+  BranchSample BranchCounter;
 
-struct HybridSampleHash {
-  uint64_t hashCombine(uint64_t Hash, uint64_t Value) const {
-    // Simple DJB2 hash
-    return ((Hash << 5) + Hash) + Value;
+  void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) {
+    RangeCounter[{Start, End}] += Repeat;
   }
-
-  uint64_t operator()(const HybridSample &Sample) const {
-    uint64_t Hash = 5381;
-    Hash = hashCombine(Hash, reinterpret_cast<uint64_t>(Sample.Binary));
-    for (const auto &Value : Sample.CallStack) {
-      Hash = hashCombine(Hash, Value);
-    }
-    for (const auto &Entry : Sample.LBRStack) {
-      Hash = hashCombine(Hash, Entry.Source);
-      Hash = hashCombine(Hash, Entry.Target);
-    }
-    return Hash;
+  void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) {
+    BranchCounter[{Source, Target}] += Repeat;
   }
 };
 
-// After parsing the sample, we record the samples by aggregating them
-// into this structure and the value is the sample counter.
-using AggregationCounter =
-    std::unordered_map<HybridSample, uint64_t, HybridSampleHash>;
+// Sample counter with context to support context-sensitive profile
+using ContextSampleCounter =
+    std::unordered_map<Hashable<ContextKey>, SampleCounter,
+                       Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>;
 
 /*
 As in hybrid sample we have a group of LBRs and the most recent sampling call
@@ -232,7 +322,7 @@
 */
 class VirtualUnwinder {
 public:
-  VirtualUnwinder(ContextSampleCounters *Counters) : SampleCounters(Counters) {}
+  VirtualUnwinder(ContextSampleCounter *Counter) : CxtSampleCounter(Counter) {}
 
   bool isCallState(UnwindState &State) const {
     // The tail call frame is always missing here in stack sample, we will
@@ -250,14 +340,16 @@
   void unwindLinear(UnwindState &State, uint64_t Repeat);
   void unwindReturn(UnwindState &State);
   void unwindBranchWithinFrame(UnwindState &State);
-  bool unwind(const HybridSample &Sample, uint64_t Repeat);
+  bool unwind(const HybridSample *Sample, uint64_t Repeat);
   void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State,
                         uint64_t Repeat);
   void recordBranchCount(const LBREntry &Branch, UnwindState &State,
                          uint64_t Repeat);
+  SampleCounter &getSampleCounter(const ProfiledBinary *Binary,
+                                  std::list<uint64_t> &CallStack);
 
 private:
-  ContextSampleCounters *SampleCounters;
+  ContextSampleCounter *CxtSampleCounter;
 };
 
 // Filename to binary map
@@ -268,7 +360,7 @@
 // same binary loaded at different addresses, they should share the same sample
 // counter
 using BinarySampleCounterMap =
-    std::unordered_map<ProfiledBinary *, ContextSampleCounters>;
+    std::unordered_map<ProfiledBinary *, ContextSampleCounter>;
 
 // Load binaries and read perf trace to parse the events and samples
 class PerfReader {
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -72,26 +72,38 @@
   State.InstPtr.update(Source);
 }
 
+SampleCounter &
+VirtualUnwinder::getSampleCounter(const ProfiledBinary *Binary,
+                                  std::list<uint64_t> &CallStack) {
+  auto KeyStr = std::make_shared<StringBasedCtxKey>();
+  KeyStr->Context = Binary->getExpandedContextStr(CallStack, true);
+  // Populate HashCode; otherwise every key hashes to 0 in the counter map.
+  KeyStr->HashCode = KeyStr->genHashCode();
+  Hashable<ContextKey> ContextId(KeyStr);
+  auto Ret = CxtSampleCounter->emplace(ContextId, SampleCounter());
+  return Ret.first->second;
+}
+
 void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End,
                                        UnwindState &State, uint64_t Repeat) {
-  std::string &&ContextId = State.getExpandedContextStr();
   uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start);
   uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End);
-  SampleCounters->recordRangeCount(ContextId, StartOffset, EndOffset, Repeat);
+  SampleCounter &SCounter =
+      getSampleCounter(State.getBinary(), State.CallStack);
+  SCounter.recordRangeCount(StartOffset, EndOffset, Repeat);
 }
 
 void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
                                         UnwindState &State, uint64_t Repeat) {
   if (Branch.IsArtificial)
     return;
-  std::string &&ContextId = State.getExpandedContextStr();
   uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source);
   uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target);
-  SampleCounters->recordBranchCount(ContextId, SourceOffset, TargetOffset,
-                                    Repeat);
+  SampleCounter &SCounter =
+      getSampleCounter(State.getBinary(), State.CallStack);
+  SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat);
 }
 
-bool VirtualUnwinder::unwind(const HybridSample &Sample, uint64_t Repeat) {
+bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
   // Capture initial state as starting point for unwinding.
   UnwindState State(Sample);
 
@@ -198,10 +210,24 @@
   return Iter->second;
 }
 
-static void printSampleCounter(ContextRangeCounter &Counter) {
-  for (auto Range : Counter) {
-    outs() << Range.first << "\n";
-    for (auto I : Range.second) {
+static void printRangeCounter(ContextSampleCounter &Counter) {
+  for (auto &CI : Counter) {
+    const StringBasedCtxKey *CtxKey =
+        cast<StringBasedCtxKey>(CI.first.getPtr());
+    outs() << CtxKey->Context << "\n";
+    for (auto I : CI.second.RangeCounter) {
+      outs() << "  (" << format("%" PRIx64, I.first.first) << ", "
+             << format("%" PRIx64, I.first.second) << "): " << I.second << "\n";
+    }
+  }
+}
+
+static void printBranchCounter(ContextSampleCounter &Counter) {
+  for (auto &CI : Counter) {
+    const StringBasedCtxKey *CtxKey =
+        cast<StringBasedCtxKey>(CI.first.getPtr());
+    outs() << CtxKey->Context << "\n";
+    for (auto I : CI.second.BranchCounter) {
       outs() << "  (" << format("%" PRIx64, I.first.first) << ", "
              << format("%" PRIx64, I.first.second) << "): " << I.second << "\n";
     }
@@ -212,16 +238,16 @@
   for (auto I : BinarySampleCounters) {
     const ProfiledBinary *Binary = I.first;
     outs() << "Binary(" << Binary->getName().str() << ")'s Range Counter:\n";
-    printSampleCounter(I.second.RangeCounter);
+    printRangeCounter(I.second);
     outs() << "\nBinary(" << Binary->getName().str() << ")'s Branch Counter:\n";
-    printSampleCounter(I.second.BranchCounter);
+    printBranchCounter(I.second);
   }
 }
 
 void PerfReader::unwindSamples() {
   for (const auto &Item : AggregatedSamples) {
-    const HybridSample &Sample = Item.first;
-    VirtualUnwinder Unwinder(&BinarySampleCounters[Sample.Binary]);
+    const HybridSample *Sample = cast<HybridSample>(Item.first.getPtr());
+    VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]);
     Unwinder.unwind(Sample, Item.second);
   }
 
@@ -363,26 +389,28 @@
   // 0x4005c8/0x4005dc/P/-/-/0   0x40062f/0x4005b0/P/-/-/0 ...
   //          ... 0x4005c8/0x4005dc/P/-/-/0    # LBR Entries
   //
-  HybridSample Sample;
+  std::shared_ptr<HybridSample> Sample = std::make_shared<HybridSample>();
 
   // Parsing call stack and populate into HybridSample.CallStack
-  if (!extractCallstack(TraceIt, Sample.CallStack)) {
+  if (!extractCallstack(TraceIt, Sample->CallStack)) {
     // Skip the next LBR line matched current call stack
     if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x"))
       TraceIt.advance();
     return;
   }
   // Set the binary current sample belongs to
-  Sample.Binary = getBinary(Sample.CallStack.front());
+  Sample->Binary = getBinary(Sample->CallStack.front());
 
   if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) {
     // Parsing LBR stack and populate into HybridSample.LBRStack
-    if (extractLBRStack(TraceIt, Sample.LBRStack, Sample.Binary)) {
+    if (extractLBRStack(TraceIt, Sample->LBRStack, Sample->Binary)) {
       // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR
       // ranges
-      Sample.CallStack.front() = Sample.LBRStack[0].Target;
+      Sample->CallStack.front() = Sample->LBRStack[0].Target;
       // Record samples by aggregation
-      AggregatedSamples[Sample]++;
+      Sample->genHashCode();
+      Hashable<PerfSample> HashKey(Sample);
+      AggregatedSamples[HashKey]++;
     }
   } else {
     // LBR sample is encoded in single line after stack sample
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -64,12 +64,24 @@
 
 public:
   void generateProfile() override {
-    // Fill in function body samples
-    populateFunctionBodySamples();
-
-    // Fill in boundary sample counts as well as call site samples for calls
-    populateFunctionBoundarySamples();
-
+    for (const auto &BI : BinarySampleCounters) {
+      ProfiledBinary *Binary = BI.first;
+      for (const auto &CI : BI.second) {
+        const StringBasedCtxKey *CtxKey =
+            cast<StringBasedCtxKey>(CI.first.getPtr());
+        StringRef ContextId(CtxKey->Context);
+        // Get or create function profile for the range
+        FunctionSamples &FunctionProfile =
+            getFunctionProfileForContext(ContextId);
+
+        // Fill in function body samples
+        populateFunctionBodySamples(FunctionProfile, CI.second.RangeCounter,
+                                    Binary);
+        // Fill in boundary sample counts as well as call site samples for calls
+        populateFunctionBoundarySamples(ContextId, FunctionProfile,
+                                        CI.second.BranchCounter, Binary);
+      }
+    }
     // Fill in call site value sample for inlined calls and also use context to
     // infer missing samples. Since we don't have call count for inlined
     // functions, we estimate it from inlinee's profile using the entry of the
@@ -85,8 +97,13 @@
                                            uint64_t Count);
   // Lookup or create FunctionSamples for the context
   FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
-  void populateFunctionBodySamples();
-  void populateFunctionBoundarySamples();
+  void populateFunctionBodySamples(FunctionSamples &FunctionProfile,
+                                   const RangeSample &RangeCounters,
+                                   ProfiledBinary *Binary);
+  void populateFunctionBoundarySamples(StringRef ContextId,
+                                       FunctionSamples &FunctionProfile,
+                                       const BranchSample &BranchCounters,
+                                       ProfiledBinary *Binary);
   void populateInferredFunctionSamples();
 };
 
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -179,95 +179,76 @@
   }
 }
 
-void CSProfileGenerator::populateFunctionBodySamples() {
-  for (const auto &BI : BinarySampleCounters) {
-    ProfiledBinary *Binary = BI.first;
-    for (const auto &CI : BI.second.RangeCounter) {
-      StringRef ContextId(CI.first);
-      // Get or create function profile for the range
-      FunctionSamples &FunctionProfile =
-          getFunctionProfileForContext(ContextId);
-      // Compute disjoint ranges first, so we can use MAX
-      // for calculating count for each location.
-      RangeSample Ranges;
-      findDisjointRanges(Ranges, CI.second);
-
-      for (auto Range : Ranges) {
-        uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
-        uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
-        uint64_t Count = Range.second;
-        // Disjoint ranges have introduce zero-filled gap that
-        // doesn't belong to current context, filter them out.
-        if (Count == 0)
-          continue;
-
-        InstructionPointer IP(Binary, RangeBegin, true);
-
-        // Disjoint ranges may have range in the middle of two instr,
-        // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
-        // can be Addr1+1 to Addr2-1. We should ignore such range.
-        if (IP.Address > RangeEnd)
-          continue;
-
-        while (IP.Address <= RangeEnd) {
-          uint64_t Offset = Binary->virtualAddrToOffset(IP.Address);
-          const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(Offset);
-          // Recording body sample for this specific context
-          updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc, Count);
-          // Move to next IP within the range
-          IP.advance();
-        }
-      }
+void CSProfileGenerator::populateFunctionBodySamples(
+    FunctionSamples &FunctionProfile, const RangeSample &RangeCounter,
+    ProfiledBinary *Binary) {
+  // Compute disjoint ranges first, so we can use MAX
+  // for calculating count for each location.
+  RangeSample Ranges;
+  findDisjointRanges(Ranges, RangeCounter);
+  for (auto Range : Ranges) {
+    uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
+    uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
+    uint64_t Count = Range.second;
+    // Disjoint ranges have introduce zero-filled gap that
+    // doesn't belong to current context, filter them out.
+    if (Count == 0)
+      continue;
+
+    InstructionPointer IP(Binary, RangeBegin, true);
+
+    // Disjoint ranges may have range in the middle of two instr,
+    // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
+    // can be Addr1+1 to Addr2-1. We should ignore such range.
+    if (IP.Address > RangeEnd)
+      continue;
+
+    while (IP.Address <= RangeEnd) {
+      uint64_t Offset = Binary->virtualAddrToOffset(IP.Address);
+      const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(Offset);
+      // Recording body sample for this specific context
+      updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc, Count);
+      // Move to next IP within the range
+      IP.advance();
     }
   }
 }
 
-void CSProfileGenerator::populateFunctionBoundarySamples() {
-  for (const auto &BI : BinarySampleCounters) {
-    ProfiledBinary *Binary = BI.first;
-    for (const auto &CI : BI.second.BranchCounter) {
-      StringRef ContextId(CI.first);
-      // Get or create function profile for branch Source
-      FunctionSamples &FunctionProfile =
-          getFunctionProfileForContext(ContextId);
-
-      for (auto Entry : CI.second) {
-        uint64_t SourceOffset = Entry.first.first;
-        uint64_t TargetOffset = Entry.first.second;
-        uint64_t Count = Entry.second;
-        // Get the callee name by branch target if it's a call branch
-        StringRef CalleeName = FunctionSamples::getCanonicalFnName(
-            Binary->getFuncFromStartOffset(TargetOffset));
-        if (CalleeName.size() == 0)
-          continue;
-
-        // Record called target sample and its count
-        const FrameLocation &LeafLoc =
-            Binary->getInlineLeafFrameLoc(SourceOffset);
-
-        FunctionProfile.addCalledTargetSamples(LeafLoc.second.LineOffset,
-                                               LeafLoc.second.Discriminator,
-                                               CalleeName, Count);
-        FunctionProfile.addTotalSamples(Count);
-
-        // Record head sample for called target(callee)
-        // TODO: Cleanup ' @ '
-        std::string CalleeContextId =
-            getCallSite(LeafLoc) + " @ " + CalleeName.str();
-        if (ContextId.find(" @ ") != StringRef::npos) {
-          CalleeContextId =
-              ContextId.rsplit(" @ ").first.str() + " @ " + CalleeContextId;
-        }
-
-        if (ProfileMap.find(CalleeContextId) != ProfileMap.end()) {
-          FunctionSamples &CalleeProfile = ProfileMap[CalleeContextId];
-          assert(Count != 0 && "Unexpected zero weight branch");
-          if (CalleeProfile.getName().size()) {
-            CalleeProfile.addHeadSamples(Count);
-          }
-        }
-      }
+void CSProfileGenerator::populateFunctionBoundarySamples(
+    StringRef ContextId, FunctionSamples &FunctionProfile,
+    const BranchSample &BranchCounters, ProfiledBinary *Binary) {
+
+  for (auto Entry : BranchCounters) {
+    uint64_t SourceOffset = Entry.first.first;
+    uint64_t TargetOffset = Entry.first.second;
+    uint64_t Count = Entry.second;
+    // Get the callee name by branch target if it's a call branch
+    StringRef CalleeName = FunctionSamples::getCanonicalFnName(
+        Binary->getFuncFromStartOffset(TargetOffset));
+    if (CalleeName.size() == 0)
+      continue;
+
+    // Record called target sample and its count
+    const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset);
+
+    FunctionProfile.addCalledTargetSamples(LeafLoc.second.LineOffset,
+                                           LeafLoc.second.Discriminator,
+                                           CalleeName, Count);
+    FunctionProfile.addTotalSamples(Count);
+
+    // Record head sample for called target(callee)
+    // TODO: Cleanup ' @ '
+    std::string CalleeContextId =
+        getCallSite(LeafLoc) + " @ " + CalleeName.str();
+    if (ContextId.find(" @ ") != StringRef::npos) {
+      CalleeContextId =
+          ContextId.rsplit(" @ ").first.str() + " @ " + CalleeContextId;
     }
+
+    FunctionSamples &CalleeProfile =
+        getFunctionProfileForContext(CalleeContextId);
+    assert(Count != 0 && "Unexpected zero weight branch");
+    CalleeProfile.addHeadSamples(Count);
   }
 }