diff --git a/llvm/docs/CommandGuide/llvm-profgen.rst b/llvm/docs/CommandGuide/llvm-profgen.rst --- a/llvm/docs/CommandGuide/llvm-profgen.rst +++ b/llvm/docs/CommandGuide/llvm-profgen.rst @@ -36,6 +36,12 @@ ------- :program:`llvm-profgen` supports the following options: +.. option:: --format=[text|binary|extbinary|compbinary|gcc] + + Specify the format of the generated profile. Supported are `text`, + `binary`, `extbinary`, `compbinary`, `gcc`, see `llvm-profdata` for more + descriptions of the format. + .. option:: --show-mmap-events Print mmap events. diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -246,6 +246,10 @@ return LineOffset == O.LineOffset && Discriminator == O.Discriminator; } + bool operator!=(const LineLocation &O) const { + return LineOffset != O.LineOffset || Discriminator != O.Discriminator; + } + uint32_t LineOffset; uint32_t Discriminator; }; @@ -585,6 +589,11 @@ /// Return the sample count of the first instruction of the function. /// The function can be either a standalone symbol or an inlined function. uint64_t getEntrySamples() const { + if (FunctionSamples::ProfileIsCS && getHeadSamples()) { + // For CS profile, if we already have more accurate head samples + // counted by branch sample from caller, use them as entry samples. + return getHeadSamples(); + } uint64_t Count = 0; // Use either BodySamples or CallsiteSamples which ever has the smaller // lineno. @@ -680,19 +689,28 @@ /// Return the function name. StringRef getName() const { return Name; } + /// Return function name with context. + StringRef getNameWithContext() const { + return FunctionSamples::ProfileIsCS ? Context.getNameWithContext() : Name; + } + /// Return the original function name. StringRef getFuncName() const { return getFuncName(Name); } /// Return the canonical name for a function, taking into account /// suffix elision policy attributes. static StringRef getCanonicalFnName(const Function &F) { - static const char *knownSuffixes[] = { ".llvm.", ".part." }; auto AttrName = "sample-profile-suffix-elision-policy"; auto Attr = F.getFnAttribute(AttrName).getValueAsString(); + return getCanonicalFnName(F.getName(), Attr); + } + + static StringRef getCanonicalFnName(StringRef FnName, StringRef Attr = "") { + static const char *knownSuffixes[] = { ".llvm.", ".part." }; if (Attr == "" || Attr == "all") { - return F.getName().split('.').first; + return FnName.split('.').first; } else if (Attr == "selected") { - StringRef Cand(F.getName()); + StringRef Cand(FnName); for (const auto &Suf : knownSuffixes) { StringRef Suffix(Suf); auto It = Cand.rfind(Suffix); @@ -704,11 +722,11 @@ } return Cand; } else if (Attr == "none") { - return F.getName(); + return FnName; } else { assert(false && "internal error: unknown suffix elision policy"); } - return F.getName(); + return FnName; } /// Translate \p Name into its original name. diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -276,7 +276,10 @@ /// it needs to be parsed by the SampleProfileReaderText class. 
std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { auto &OS = *OutputStream; - OS << S.getName() << ":" << S.getTotalSamples(); + if (FunctionSamples::ProfileIsCS) + OS << "[" << S.getNameWithContext() << "]:" << S.getTotalSamples(); + else + OS << S.getName() << ":" << S.getTotalSamples(); if (Indent == 0) OS << ":" << S.getHeadSamples(); OS << "\n"; diff --git a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfbin b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfbin new file mode 100755 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ start addr of bar +// 400684 -> address in main +// LBR Entry: | Source | Target +// 0x40062f/0x4005b0/P/-/-/0 | callq -132 | start addr of bar +// 0x400645/0x4005ff/P/-/-/0 | jmp -75 | movl -8(%rbp), %eax +// 0x400637/0x400645/P/-/-/0 | jmp 9 | jmp -75 +// 0x4005e9/0x400634/P/-/-/0 | (bar)retq | next addr of [callq -132 ] +// 0x4005d7/0x4005e5/P/-/-/0 | jmp 9 | movl -4(%rbp), %eax diff --git a/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test @@ -0,0 +1,47 @@ +; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER +; RUN: FileCheck %s --input-file %t + +; CHECK:[main:1 @ foo]:44:0 +; CHECK: 2.2: 14 +; CHECK: 3: 15 +; CHECK: 3.2: 14 bar:14 +; CHECK: 3.4: 1 +; CHECK:[main:1 @ foo:3.2 @ bar]:14:0 +; CHECK: 1: 14 + +; CHECK-UNWINDER: Binary(inline-cs-noprobe.perfbin)'s Range Counter: +; CHECK-UNWINDER: main:1 @ foo:3.2 @ bar +; CHECK-UNWINDER: (6af, 6bb): 14 +; CHECK-UNWINDER: main:1 @ foo +; CHECK-UNWINDER: (670, 6ad): 1 +; CHECK-UNWINDER: (67e, 69b): 1 +; CHECK-UNWINDER: (67e, 6ad): 13 +; CHECK-UNWINDER: (6bd, 6c8): 14 + +; CHECK-UNWINDER: Binary(inline-cs-noprobe.perfbin)'s Branch Counter: +; CHECK-UNWINDER: main:1 @ foo +; CHECK-UNWINDER: (69b, 670): 1 +; CHECK-UNWINDER: (6c8, 67e): 15 + +; original code: +; clang -O3 -g test.c -o a.out +#include + +int bar(int x, int y) { + if (x % 3) { + return x - y; + } + return x + y; +} + +void foo() { + int s, i = 0; + while (i++ < 4000 * 4000) + if (i % 91) s = bar(i, s); else s += 30; + printf("sum is %d\n", s); +} + +int main() { + foo(); + return 0; +} diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test @@ -0,0 +1,60 @@ +; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-noprobe.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER +; RUN: FileCheck %s --input-file %t + +; CHECK:[main:1 @ foo:3 @ bar]:12:3 +; CHECK: 0: 3 +; CHECK: 1: 3 +; CHECK: 2: 2 +; CHECK: 4: 1 +; CHECK: 5: 3 +; CHECK:[main:1 @ foo]:9:0 +; CHECK: 2: 3 +; CHECK: 3: 3 bar:3 + +; CHECK-UNWINDER: Binary(noinline-cs-noprobe.perfbin)'s Range Counter: +; CHECK-UNWINDER: main:1 @ foo +; CHECK-UNWINDER: (5ff, 62f): 3 +; CHECK-UNWINDER: (634, 637): 3 +; CHECK-UNWINDER: (645, 645): 3 +; CHECK-UNWINDER: main:1 @ foo:3 @ bar +; CHECK-UNWINDER: (5b0, 5c8): 1 +; CHECK-UNWINDER: (5b0, 5d7): 2 +; CHECK-UNWINDER: (5dc, 5e9): 1 +; CHECK-UNWINDER: (5e5, 5e9): 2 + +; CHECK-UNWINDER: 
Binary(noinline-cs-noprobe.perfbin)'s Branch Counter: +; CHECK-UNWINDER: main:1 @ foo +; CHECK-UNWINDER: (62f, 5b0): 3 +; CHECK-UNWINDER: (637, 645): 3 +; CHECK-UNWINDER: (645, 5ff): 3 +; CHECK-UNWINDER: main:1 @ foo:3 @ bar +; CHECK-UNWINDER: (5c8, 5dc): 2 +; CHECK-UNWINDER: (5d7, 5e5): 2 +; CHECK-UNWINDER: (5e9, 634): 3 + + + + + +; original code: +; clang -O0 -g test.c -o a.out +#include + +int bar(int x, int y) { + if (x % 3) { + return x - y; + } + return x + y; +} + +void foo() { + int s, i = 0; + while (i++ < 4000 * 4000) + if (i % 91) s = bar(i, s); else s += 30; + printf("sum is %d\n", s); +} + +int main() { + foo(); + return 0; +} diff --git a/llvm/tools/llvm-profgen/CMakeLists.txt b/llvm/tools/llvm-profgen/CMakeLists.txt --- a/llvm/tools/llvm-profgen/CMakeLists.txt +++ b/llvm/tools/llvm-profgen/CMakeLists.txt @@ -7,6 +7,7 @@ MC MCDisassembler Object + ProfileData Support Symbolize ) @@ -15,4 +16,5 @@ llvm-profgen.cpp PerfReader.cpp ProfiledBinary.cpp + ProfileGenerator.cpp ) diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -56,16 +56,238 @@ } }; +// The type of perfscript +enum PerfScriptType { + PERF_INVILID = 0, + PERF_LBR = 1, // Only LBR sample + PERF_LBR_STACK = 2, // Hybrid sample including call stack and LBR stack. +}; + +// The parsed LBR sample entry. +struct LBREntry { + uint64_t Source = 0; + uint64_t Target = 0; + // An artificial branch stands for a series of consecutive branches starting + // from the current binary with a transition through external code and + // eventually landing back in the current binary. + bool IsArtificial = false; + LBREntry(uint64_t S, uint64_t T, bool I) + : Source(S), Target(T), IsArtificial(I) {} +}; + +// The parsed hybrid sample including call stack and LBR stack. 
+struct HybridSample { + // Profiled binary that current frame address belongs to + ProfiledBinary *Binary; + // Call stack recorded in FILO(leaf to root) order + std::list CallStack; + // LBR stack recorded in FIFO order + SmallVector LBRStack; + + // Used for sample aggregation + bool operator==(const HybridSample &Other) const { + if (Other.Binary != Binary) + return false; + const std::list &OtherCallStack = Other.CallStack; + const SmallVector &OtherLBRStack = Other.LBRStack; + + if (CallStack.size() != OtherCallStack.size() || + LBRStack.size() != OtherLBRStack.size()) + return false; + + auto Iter = CallStack.begin(); + for (auto Address : OtherCallStack) { + if (Address != *Iter++) + return false; + } + + for (size_t I = 0; I < OtherLBRStack.size(); I++) { + if (LBRStack[I].Source != OtherLBRStack[I].Source || + LBRStack[I].Target != OtherLBRStack[I].Target) + return false; + } + return true; + } +}; + +// The state for the unwinder, it doesn't hold the data but only keep the +// pointer/index of the data, While unwinding, the CallStack is changed +// dynamicially and will be recorded as the context of the sample +struct UnwindState { + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; + // TODO: switch to use trie for call stack + std::list CallStack; + // Used to fall through the LBR stack + uint32_t LBRIndex = 0; + // Reference to HybridSample.LBRStack + const SmallVector &LBRStack; + // Used to iterate the address range + InstructionPointer InstPtr; + UnwindState(const HybridSample &Sample) + : Binary(Sample.Binary), CallStack(Sample.CallStack), + LBRStack(Sample.LBRStack), + InstPtr(Sample.Binary, Sample.CallStack.front()) {} + + bool validateInitialState() { + uint64_t LBRLeaf = LBRStack[LBRIndex].Target; + uint64_t StackLeaf = CallStack.front(); + // When we take a stack sample, ideally the sampling distance between the + // leaf IP of stack and the last LBR target shouldn't be very large. + // Use a heuristic size (0x100) to filter out broken records. + if (StackLeaf < LBRLeaf || StackLeaf >= LBRLeaf + 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", StackLeaf) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CallStack.front() && + "IP should align with context leaf"); + } + + std::string getExpandedContextStr() const { + return Binary->getExpandedContextStr(CallStack); + } + const ProfiledBinary *getBinary() const { return Binary; } + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + void advanceLBR() { LBRIndex++; } +}; + +// The counter of branch samples for one function indexed by the branch, +// which is represented as the source and target offset pair. +using BranchSample = std::map, uint64_t>; +// The counter of range samples for one function indexed by the range, +// which is represented as the start and end offset pair. 
+using RangeSample = std::map, uint64_t>; +// Range sample counters indexed by the context string +using ContextRangeCounter = std::unordered_map; +// Branch sample counters indexed by the context string +using ContextBranchCounter = std::unordered_map; + +// For Hybrid sample counters +struct ContextSampleCounters { + ContextRangeCounter RangeCounter; + ContextBranchCounter BranchCounter; + + void recordRangeCount(std::string &ContextId, uint64_t Start, uint64_t End, + uint64_t Repeat) { + RangeCounter[ContextId][{Start, End}] += Repeat; + } + void recordBranchCount(std::string &ContextId, uint64_t Source, + uint64_t Target, uint64_t Repeat) { + BranchCounter[ContextId][{Source, Target}] += Repeat; + } +}; + +struct HybridSampleHash { + uint64_t hashCombine(uint64_t Hash, uint64_t Value) const { + // Simple DJB2 hash + return ((Hash << 5) + Hash) + Value; + } + + uint64_t operator()(const HybridSample &Sample) const { + uint64_t Hash = 5381; + Hash = hashCombine(Hash, reinterpret_cast(Sample.Binary)); + for (const auto &Value : Sample.CallStack) { + Hash = hashCombine(Hash, Value); + } + for (const auto &Entry : Sample.LBRStack) { + Hash = hashCombine(Hash, Entry.Source); + Hash = hashCombine(Hash, Entry.Target); + } + return Hash; + } +}; + +// After parsing the sample, we record the samples by aggregating them +// into this structure and the value is the sample counter. +using AggregationCounter = + std::unordered_map; + +/* +Since a hybrid sample contains a group of LBRs and the most recently sampled call +stack, we can walk through those LBRs to infer more call stacks which will be +used as contexts for the profile. VirtualUnwinder is the class that does the call stack +unwinding based on LBR state. Two types of unwinding are processed here: +1) LBR unwinding and 2) linear range unwinding. +Specifically, for each LBR entry (which can be classified as a call, return or regular +branch), LBR unwinding replays the operation by pushing, popping or +switching the leaf frame of the call stack. Since the initial call stack +is the most recently sampled one, the replay is done in anti-execution order, i.e. in +the regular case, pop the call stack when the LBR is a call and push a frame on the call stack +when the LBR is a return. After each LBR is processed, it also needs to align with the +next LBR by going through the instructions from the previous LBR's target to the current +LBR's source, which is the linear unwinding. As instructions in a linear range +can come from different functions due to inlining, linear unwinding does the range +splitting and records counters by the range with the same inline context. Over this +unwinding process we record each call stack as a context id and each LBR/linear +range as a sample counter for further CS profile generation. +*/ +class VirtualUnwinder { +public: + VirtualUnwinder(ContextSampleCounters *Counters) : SampleCounters(Counters) {} + + bool isCallState(UnwindState &State) const { + // The tail call frame is always missing here in the stack sample; we will + // use a specific tail call tracker to infer it. + return State.getBinary()->addressIsCall(State.getCurrentLBRSource()); + } + + bool isReturnState(UnwindState &State) const { + // Simply check addressIsReturn, as ret is always reliable, both for + // regular call and tail call.
+ return State.getBinary()->addressIsReturn(State.getCurrentLBRSource()); + } + + void unwindCall(UnwindState &State); + void unwindLinear(UnwindState &State, uint64_t Repeat); + void unwindReturn(UnwindState &State); + void unwindBranchWithinFrame(UnwindState &State); + bool unwind(const HybridSample &Sample, uint64_t Repeat); + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, + uint64_t Repeat); + void recordBranchCount(const LBREntry &Branch, UnwindState &State, + uint64_t Repeat); + +private: + ContextSampleCounters *SampleCounters; +}; + // Filename to binary map using BinaryMap = StringMap; // Address to binary map for fast look-up using AddressBinaryMap = std::map; +// Binary to ContextSampleCounters Map to support multiple binaries. We may have +// the same binary loaded at different addresses; they should share the same sample +// counter +using BinarySampleCounterMap = + std::unordered_map; // Load binaries and read perf trace to parse the events and samples class PerfReader { - BinaryMap BinaryTable; - AddressBinaryMap AddrToBinaryMap; // Used by address-based lookup. +public: + PerfReader(cl::list &BinaryFilenames); + + // Hybrid sample (call stack + LBRs) profile traces are separated by a double line + // break; search for that within the first 4k characters to avoid going + // through the whole file. + static bool isHybridPerfScript(StringRef FileName) { + auto BufOrError = MemoryBuffer::getFileOrSTDIN(FileName, 4000); + if (!BufOrError) + exitWithError(BufOrError.getError(), FileName); + auto Buffer = std::move(BufOrError.get()); + if (Buffer->getBuffer().find("\n\n") == StringRef::npos) + return false; + return true; + } // The parsed MMap event struct MMapEvent { @@ -82,18 +304,48 @@ ProfiledBinary &loadBinary(const StringRef BinaryPath, bool AllowNameConflict = true); void updateBinaryAddress(const MMapEvent &Event); + PerfScriptType getPerfScriptType() const { return PerfType; } + // Entry of the reader to parse multiple perf traces + void parsePerfTraces(cl::list &PerfTraceFilenames); + const BinarySampleCounterMap &getBinarySampleCounters() const { + return BinarySampleCounters; + } -public: - PerfReader(cl::list &BinaryFilenames); - +private: /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a /// mapping between the binary name and its memory layout. /// void parseMMap2Event(TraceStream &TraceIt); - void parseEvent(TraceStream &TraceIt); - // Parse perf events and samples - void parseTrace(StringRef Filename); - void parsePerfTraces(cl::list &PerfTraceFilenames); + // Parse perf events/samples and do aggregation + void parseAndAggregateTrace(StringRef Filename); + // Parse either an MMAP event or a perf sample + void parseEventOrSample(TraceStream &TraceIt); + // Parse the hybrid sample including the call and LBR line + void parseHybridSample(TraceStream &TraceIt); + // Extract call stack from the perf trace lines + bool extractCallstack(TraceStream &TraceIt, std::list &CallStack); + // Extract LBR stack from one perf trace line + bool extractLBRStack(TraceStream &TraceIt, + SmallVector &LBRStack, + ProfiledBinary *Binary); + void checkAndSetPerfType(cl::list &PerfTraceFilenames); + // Post-process the profile after trace aggregation; we will do simple range + // overlap computation for AutoFDO, or unwinding for CSSPGO (hybrid sample).
+ void generateRawProfile(); + // Unwind the hybrid samples after aggregration + void unwindSamples(); + void printUnwinderOutput(); + // Helper function for looking up binary in AddressBinaryMap + ProfiledBinary *getBinary(uint64_t Address); + + BinaryMap BinaryTable; + AddressBinaryMap AddrToBinaryMap; // Used by address-based lookup. + +private: + BinarySampleCounterMap BinarySampleCounters; + // Samples with the repeating time generated by the perf reader + AggregationCounter AggregatedSamples; + PerfScriptType PerfType; }; } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -11,9 +11,136 @@ cl::init(false), cl::ZeroOrMore, cl::desc("Print binary load events.")); +static cl::opt ShowUnwinderOutput("show-unwinder-output", + cl::ReallyHidden, cl::init(false), + cl::ZeroOrMore, + cl::desc("Print unwinder output")); + namespace llvm { namespace sampleprof { +void VirtualUnwinder::unwindCall(UnwindState &State) { + // The 2nd frame after leaf could be missing if stack sample is + // taken when IP is within prolog/epilog, as frame chain isn't + // setup yet. Fill in the missing frame in that case. + // TODO: Currently we just assume all the addr that can't match the + // 2nd frame is in prolog/epilog. In the future, we will switch to + // pro/epi tracker(Dwarf CFI) for the precise check. + uint64_t Source = State.getCurrentLBRSource(); + auto Iter = State.CallStack.begin(); + if (State.CallStack.size() == 1 || *(++Iter) != Source) { + State.CallStack.front() = Source; + } else { + State.CallStack.pop_front(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + // Unwind linear execution part + while (IP.Address >= Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = + State.getBinary()->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee || PrevIP == Target) { + recordRangeCount(PrevIP, End, State, Repeat); + End = IP.Address; + } + State.CallStack.front() = IP.Address; + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = State.getBinary()->getCallAddrFromFrameAddr(LBR.Target); + State.CallStack.front() = CallAddr; + State.CallStack.push_front(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. + // This is only for intra function branches, excluding tail calls. 
+ uint64_t Source = State.getCurrentLBRSource(); + State.CallStack.front() = Source; + State.InstPtr.update(Source); +} + +void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End, + UnwindState &State, uint64_t Repeat) { + std::string &&ContextId = State.getExpandedContextStr(); + uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start); + uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End); + SampleCounters->recordRangeCount(ContextId, StartOffset, EndOffset, Repeat); +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.IsArtificial) + return; + std::string &&ContextId = State.getExpandedContextStr(); + uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source); + uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target); + SampleCounters->recordBranchCount(ContextId, SourceOffset, TargetOffset, + Repeat); +} + +bool VirtualUnwinder::unwind(const HybridSample &Sample, uint64_t Repeat) { + // Capture initial state as starting point for unwinding. + UnwindState State(Sample); + + // Sanity check - making sure the leaf of the LBR aligns with the leaf of the stack sample. + // Stack samples can sometimes be unreliable, so filter out bogus ones. + if (!State.validateInitialState()) + return false; + + // Also do not attempt linear unwind for the leaf range as it's incomplete. + bool IsLeaf = true; + + // Now process the LBR samples in parallel with the stack sample + // Note that we do not reverse the LBR entry order so we can + // unwind the sample stack as we walk through LBR entries. + while (State.hasNextLBR()) { + State.checkStateConsistency(); + + // Unwind implicit calls/returns from inlining, along the linear path, + // breaking it into smaller sub-sections each with its own calling context. + if (!IsLeaf) { + unwindLinear(State, Repeat); + } + IsLeaf = false; + + // Save the LBR branch before it gets unwound. + const LBREntry &Branch = State.getCurrentLBR(); + + if (isCallState(State)) { + // Unwind calls - we know we encountered a call if the LBR overlaps with the + // transition between the leaf and the 2nd frame. Note that for calls that + // were not in the original stack sample, we should have added the + // extra frame when processing the return paired with this call. + unwindCall(State); + } else if (isReturnState(State)) { + // Unwind returns - check whether the IP is indeed at a return instruction + unwindReturn(State); + } else { + // Unwind branches - for regular intra-function branches, we only + // need to record the branch with its context. + unwindBranchWithinFrame(State); + } + State.advanceLBR(); + // Record `branch` with calling context after unwinding. + recordBranchCount(Branch, State, Repeat); + } + + return true; +} + PerfReader::PerfReader(cl::list &BinaryFilenames) { // Load the binaries.
for (auto Filename : BinaryFilenames) @@ -61,6 +188,208 @@ Binary.setBaseAddress(Event.BaseAddress); } +ProfiledBinary *PerfReader::getBinary(uint64_t Address) { + auto Iter = AddrToBinaryMap.lower_bound(Address); + if (Iter == AddrToBinaryMap.end() || Iter->first != Address) { + if (Iter == AddrToBinaryMap.begin()) + return nullptr; + Iter--; + } + return Iter->second; +} + +static void printSampleCounter(ContextRangeCounter &Counter) { + for (auto Range : Counter) { + outs() << Range.first << "\n"; + for (auto I : Range.second) { + outs() << " (" << format("%" PRIx64, I.first.first) << ", " + << format("%" PRIx64, I.first.second) << "): " << I.second << "\n"; + } + } +} + +void PerfReader::printUnwinderOutput() { + for (auto I : BinarySampleCounters) { + const ProfiledBinary *Binary = I.first; + outs() << "Binary(" << Binary->getName().str() << ")'s Range Counter:\n"; + printSampleCounter(I.second.RangeCounter); + outs() << "\nBinary(" << Binary->getName().str() << ")'s Branch Counter:\n"; + printSampleCounter(I.second.BranchCounter); + } +} + +void PerfReader::unwindSamples() { + for (const auto &Item : AggregatedSamples) { + const HybridSample &Sample = Item.first; + VirtualUnwinder Unwinder(&BinarySampleCounters[Sample.Binary]); + Unwinder.unwind(Sample, Item.second); + } + + if (ShowUnwinderOutput) + printUnwinderOutput(); +} + +bool PerfReader::extractLBRStack(TraceStream &TraceIt, + SmallVector &LBRStack, + ProfiledBinary *Binary) { + // The raw format of LBR stack is like: + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 0x4005c8/0x4005dc/P/-/-/0 + // It's in FIFO order and seperated by whitespace. + SmallVector Records; + TraceIt.getCurrentLine().split(Records, " "); + + // Extract leading instruction pointer if present, use single + // list to pass out as reference. + size_t Index = 0; + if (!Records.empty() && Records[0].find('/') == StringRef::npos) { + Index = 1; + } + // Now extract LBR samples - note that we do not reverse the + // LBR entry order so we can unwind the sample stack as we walk + // through LBR entries. + uint64_t PrevTrDst = 0; + + while (Index < Records.size()) { + auto &Token = Records[Index++]; + if (Token.size() == 0) + continue; + + SmallVector Addresses; + Token.split(Addresses, "/"); + uint64_t Src; + uint64_t Dst; + Addresses[0].substr(2).getAsInteger(16, Src); + Addresses[1].substr(2).getAsInteger(16, Dst); + + bool SrcIsInternal = Binary->addressIsCode(Src); + bool DstIsInternal = Binary->addressIsCode(Dst); + bool IsArtificial = false; + // Ignore branches outside the current binary. + if (!SrcIsInternal && !DstIsInternal) + continue; + if (!SrcIsInternal && DstIsInternal) { + // For transition from external code (such as dynamic libraries) to + // the current binary, keep track of the branch target which will be + // grouped with the Source of the last transition from the current + // binary. + PrevTrDst = Dst; + continue; + } + if (SrcIsInternal && !DstIsInternal) { + // For transition to external code, group the Source with the next + // availabe transition target. 
+ if (!PrevTrDst) + continue; + Dst = PrevTrDst; + PrevTrDst = 0; + IsArtificial = true; + } + // TODO: filter out buggy duplicate branches on Skylake + + LBRStack.emplace_back(LBREntry(Src, Dst, IsArtificial)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfReader::extractCallstack(TraceStream &TraceIt, + std::list &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. + + // Extract stack frames from sample + ProfiledBinary *Binary = nullptr; + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + // We might get an empty line or comments at the beginning; skip them + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + TraceIt.advance(); + break; + } + TraceIt.advance(); + if (!Binary) { + Binary = getBinary(FrameAddr); + // We might have an address that doesn't match any MMAP event; skip it + if (!Binary) { + if (AddrToBinaryMap.size() == 0) + WithColor::warning() << "No MMAP event in the perfscript, create it " + "with '--show-mmap-events'\n"; + break; + } + } + // Currently intermixing frames from different binaries is not supported. + // Ignore bottom frames not from the binary of interest. + if (!Binary->addressIsCode(FrameAddr)) + break; + + // We need to translate the return address to the call address + // for non-leaf frames + if (!CallStack.empty()) { + FrameAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + } + + CallStack.emplace_back(FrameAddr); + } + + if (CallStack.empty()) + return false; + // Skip other unrelated lines and find the next valid LBR line + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + TraceIt.advance(); + } + // Filter out broken stack samples. We may not have complete frame info + // if the sample ends up in the prolog/epilog; the result is a dangling context not + // connected to the entry point. This should be relatively rare and thus has little + // impact on overall profile quality. However we do want to filter such samples + // out to reduce the number of different calling contexts. One instance + // of such a case - when the sample lands in the prolog/epilog, stack + // walking can break in an unexpected way such that higher frames are + // missing. + return !Binary->addressInPrologEpilog(CallStack.front()); +} + +void PerfReader::parseHybridSample(TraceStream &TraceIt) { + // The raw hybrid sample starts with the call stack in FILO order, followed + // immediately by the LBR sample + // e.g. + // 4005dc # call stack leaf + // 400634 + // 400684 # call stack root + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ...
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries + // + HybridSample Sample; + + // Parsing call stack and populate into HybridSample.CallStack + if (!extractCallstack(TraceIt, Sample.CallStack)) { + // Skip the next LBR line matched current call stack + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) + TraceIt.advance(); + return; + } + // Set the binary current sample belongs to + Sample.Binary = getBinary(Sample.CallStack.front()); + + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) { + // Parsing LBR stack and populate into HybridSample.LBRStack + if (extractLBRStack(TraceIt, Sample.LBRStack, Sample.Binary)) { + // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR + // ranges + Sample.CallStack.front() = Sample.LBRStack[0].Target; + // Record samples by aggregation + AggregatedSamples[Sample]++; + } + } else { + // LBR sample is encoded in single line after stack sample + exitWithError("'Hybrid perf sample is corrupted, No LBR sample line"); + } +} + void PerfReader::parseMMap2Event(TraceStream &TraceIt) { // Parse a line like: // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 @@ -104,27 +433,74 @@ outs() << "Mmap: Binary " << Event.BinaryPath << " loaded at " << format("0x%" PRIx64 ":", Event.BaseAddress) << " \n"; } + TraceIt.advance(); } -void PerfReader::parseEvent(TraceStream &TraceIt) { +void PerfReader::parseEventOrSample(TraceStream &TraceIt) { if (TraceIt.getCurrentLine().startswith("PERF_RECORD_MMAP2")) parseMMap2Event(TraceIt); - - TraceIt.advance(); + else if (getPerfScriptType() == PERF_LBR_STACK) + parseHybridSample(TraceIt); + else { + // TODO: parse other type sample + TraceIt.advance(); + } } -void PerfReader::parseTrace(StringRef Filename) { +void PerfReader::parseAndAggregateTrace(StringRef Filename) { // Trace line iterator TraceStream TraceIt(Filename); - while (!TraceIt.isAtEoF()) { - parseEvent(TraceIt); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +void PerfReader::checkAndSetPerfType( + cl::list &PerfTraceFilenames) { + bool HasHybridPerf = true; + for (auto FileName : PerfTraceFilenames) { + if (!isHybridPerfScript(FileName)) { + HasHybridPerf = false; + break; + } + } + + if (HasHybridPerf) { + // Set up ProfileIsCS to enable context-sensitive functionalities + // in SampleProf + FunctionSamples::ProfileIsCS = true; + PerfType = PERF_LBR_STACK; + + } else { + // TODO: Support other type of perf script + PerfType = PERF_INVILID; + } + + if (BinaryTable.size() > 1) { + // TODO: remove this if everything is ready to support multiple binaries. + exitWithError("Currently only support one input binary, multiple binaries' " + "profile will be merged in one profile and make profile " + "summary info inaccurate. Please use `perfdata` to merge " + "profiles from multiple binaries."); + } +} + +void PerfReader::generateRawProfile() { + if (getPerfScriptType() == PERF_LBR_STACK) { + // Unwind samples if it's hybird sample + unwindSamples(); + } else if (getPerfScriptType() == PERF_LBR) { + // TODO: range overlap computation for regular AutoFDO } } void PerfReader::parsePerfTraces(cl::list &PerfTraceFilenames) { - // Parse perf traces. + // Check and set current perfscript type + checkAndSetPerfType(PerfTraceFilenames); + // Parse perf traces and do aggregation. 
for (auto Filename : PerfTraceFilenames) - parseTrace(Filename); + parseAndAggregateTrace(Filename); + + generateRawProfile(); } } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -0,0 +1,96 @@ +//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/ProfileData/SampleProfWriter.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +class ProfileGenerator { + +public: + ProfileGenerator(){}; + virtual ~ProfileGenerator() = default; + static std::unique_ptr + create(const BinarySampleCounterMap &SampleCounters, + enum PerfScriptType SampleType); + virtual void generateProfile() = 0; + + // Use SampleProfileWriter to serialize profile map + void write(); + +protected: + /* + For each region boundary point, mark if it is begin or end (or both) of + the region. Boundary points are inclusive. Log the sample count as well + so we can use it when we compute the sample count of each disjoint region + later. Note that there might be multiple ranges with different sample + count that share same begin/end point. We need to accumulate the sample + count for the boundary point for such case, because for the example + below, + + |<--100-->| + |<------200------>| + A B C + + sample count for disjoint region [A,B] would be 300. + */ + void findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges); + + // Used by SampleProfileWriter + StringMap ProfileMap; +}; + +class CSProfileGenerator : public ProfileGenerator { + const BinarySampleCounterMap &BinarySampleCounters; + +public: + CSProfileGenerator(const BinarySampleCounterMap &Counters) + : BinarySampleCounters(Counters){}; + +public: + void generateProfile() override { + // Fill in function body samples + populateFunctionBodySamples(); + + // Fill in boundary sample counts as well as call site samples for calls + populateFunctionBoundarySamples(); + + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. 
+ populateInferredFunctionSamples(); + } + +private: + // Helper function for updating the body sample for a leaf location in + // FunctionProfile + void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, + const FrameLocation &LeafLoc, + uint64_t Count); + // Lookup or create FunctionSamples for the context + FunctionSamples &getFunctionProfileForContext(StringRef ContextId); + void populateFunctionBodySamples(); + void populateFunctionBoundarySamples(); + void populateInferredFunctionSamples(); +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -0,0 +1,329 @@ +//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfileGenerator.h" + +static cl::opt OutputFilename("output", cl::value_desc("output"), + cl::Required, + cl::desc("Output profile file")); + +static cl::opt OutputFormat( + "format", cl::desc("Format of output profile"), cl::init(SPF_Text), + cl::values( + clEnumValN(SPF_Binary, "binary", "Binary encoding"), + clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"), + clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), + clEnumValN(SPF_Text, "text", "Text encoding (default)"), + clEnumValN(SPF_GCC, "gcc", + "GCC encoding (only meaningful for -sample)"))); + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +std::unique_ptr +ProfileGenerator::create(const BinarySampleCounterMap &BinarySampleCounters, + enum PerfScriptType SampleType) { + std::unique_ptr ProfileGenerator; + + if (SampleType == PERF_LBR_STACK) { + ProfileGenerator.reset(new CSProfileGenerator(BinarySampleCounters)); + } else { + // TODO: + llvm_unreachable("Unsupported perfscript!"); + } + + return ProfileGenerator; +} + +void ProfileGenerator::write() { + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); + if (std::error_code EC = WriterOrErr.getError()) + exitWithError(EC, OutputFilename); + auto Writer = std::move(WriterOrErr.get()); + Writer->write(ProfileMap); +} + +void ProfileGenerator::findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges) { + + /* + Regions may overlap with each other. Using the boundary info, find all + disjoint ranges and their sample count. BoundaryPoint contains the count + of the multiple samples that begin/end at the same point. + + |<--100-->| Sample1 + |<------200------>| Sample2 + A B C + + In the example above, + Sample1 begins at A, ends at B, its value is 100. + Sample2 begins at A, ends at C, its value is 200. + For A, BeginCount is the sum of the samples that begin at A, which is 300, and no + samples end at A, so EndCount is 0.
+ Then boundary points A, B, and C with begin/end counts are: + A: (300, 0) + B: (0, 100) + C: (0, 200) + */ + struct BoundaryPoint { + // Sum of sample counts beginning at this point + uint64_t BeginCount; + // Sum of sample counts ending at this point + uint64_t EndCount; + + BoundaryPoint() : BeginCount(0), EndCount(0){}; + + void addBeginCount(uint64_t Count) { BeginCount += Count; } + + void addEndCount(uint64_t Count) { EndCount += Count; } + }; + + /* + For the above example, with those boundary points, the following logic finds two + disjoint regions: + + [A,B]: 300 + [B+1,C]: 200 + + If there is a boundary point that is both a begin and an end, the point itself + becomes a separate disjoint region. For example, if we have original + ranges of + + |<--- 100 --->| + |<--- 200 --->| + A B C + + there are three boundary points with their begin/end counts of + + A: (100, 0) + B: (200, 100) + C: (0, 200) + + the disjoint ranges would be + + [A, B-1]: 100 + [B, B]: 300 + [B+1, C]: 200. + */ + std::map Boundaries; + + for (auto Item : Ranges) { + uint64_t Begin = Item.first.first; + uint64_t End = Item.first.second; + uint64_t Count = Item.second; + if (Boundaries.find(Begin) == Boundaries.end()) + Boundaries[Begin] = BoundaryPoint(); + Boundaries[Begin].addBeginCount(Count); + + if (Boundaries.find(End) == Boundaries.end()) + Boundaries[End] = BoundaryPoint(); + Boundaries[End].addEndCount(Count); + } + + uint64_t BeginAddress = 0; + int Count = 0; + for (auto Item : Boundaries) { + uint64_t Address = Item.first; + BoundaryPoint &Point = Item.second; + if (Point.BeginCount) { + if (BeginAddress) + DisjointRanges[{BeginAddress, Address - 1}] = Count; + Count += Point.BeginCount; + BeginAddress = Address; + } + if (Point.EndCount) { + assert(BeginAddress && "First boundary point cannot be 'end' point"); + DisjointRanges[{BeginAddress, Address}] = Count; + Count -= Point.EndCount; + BeginAddress = Address + 1; + } + } +} + +FunctionSamples & +CSProfileGenerator::getFunctionProfileForContext(StringRef ContextStr) { + auto Ret = ProfileMap.try_emplace(ContextStr, FunctionSamples()); + if (Ret.second) { + SampleContext FContext(Ret.first->first(), RawContext); + FunctionSamples &FProfile = Ret.first->second; + FProfile.setName(FContext.getName()); + FProfile.setContext(FContext); + } + return Ret.first->second; +} + +void CSProfileGenerator::updateBodySamplesforFunctionProfile( + FunctionSamples &FunctionProfile, const FrameLocation &LeafLoc, + uint64_t Count) { + // Filter out invalid negative (int type) lineOffset + if (LeafLoc.second.LineOffset & 0x80000000) + return; + // Use the maximum count of samples with the same line location + ErrorOr R = FunctionProfile.findSamplesAt( + LeafLoc.second.LineOffset, LeafLoc.second.Discriminator); + uint64_t PreviousCount = R ? R.get() : 0; + if (PreviousCount < Count) { + FunctionProfile.addBodySamples(LeafLoc.second.LineOffset, + LeafLoc.second.Discriminator, + Count - PreviousCount); + FunctionProfile.addTotalSamples(Count - PreviousCount); + } +} + +void CSProfileGenerator::populateFunctionBodySamples() { + for (const auto &BI : BinarySampleCounters) { + ProfiledBinary *Binary = BI.first; + for (const auto &CI : BI.second.RangeCounter) { + StringRef ContextId(CI.first); + // Get or create function profile for the range + FunctionSamples &FunctionProfile = + getFunctionProfileForContext(ContextId); + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location.
+ RangeSample Ranges; + findDisjointRanges(Ranges, CI.second); + + for (auto Range : Ranges) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. + if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + while (IP.Address <= RangeEnd) { + uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); + const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(Offset); + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc, Count); + // Move to next IP within the range + IP.advance(); + } + } + } + } +} + +void CSProfileGenerator::populateFunctionBoundarySamples() { + for (const auto &BI : BinarySampleCounters) { + ProfiledBinary *Binary = BI.first; + for (const auto &CI : BI.second.BranchCounter) { + StringRef ContextId(CI.first); + // Get or create function profile for branch Source + FunctionSamples &FunctionProfile = + getFunctionProfileForContext(ContextId); + + for (auto Entry : CI.second) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + // Get the callee name by branch target if it's a call branch + StringRef CalleeName = FunctionSamples::getCanonicalFnName( + Binary->getFuncFromStartOffset(TargetOffset)); + if (CalleeName.size() == 0) + continue; + + // Record called target sample and its count + const FrameLocation &LeafLoc = + Binary->getInlineLeafFrameLoc(SourceOffset); + + FunctionProfile.addCalledTargetSamples(LeafLoc.second.LineOffset, + LeafLoc.second.Discriminator, + CalleeName, Count); + FunctionProfile.addTotalSamples(Count); + + // Record head sample for called target(callee) + // TODO: Cleanup ' @ ' + std::string CalleeContextId = + getCallSite(LeafLoc) + " @ " + CalleeName.str(); + if (ContextId.find(" @ ") != StringRef::npos) { + CalleeContextId = + ContextId.rsplit(" @ ").first.str() + " @ " + CalleeContextId; + } + + if (ProfileMap.find(CalleeContextId) != ProfileMap.end()) { + FunctionSamples &CalleeProfile = ProfileMap[CalleeContextId]; + assert(Count != 0 && "Unexpected zero weight branch"); + if (CalleeProfile.getName().size()) { + CalleeProfile.addHeadSamples(Count); + } + } + } + } + } +} + +static FrameLocation getCallerContext(StringRef CalleeContext, + StringRef &CallerNameWithContext) { + StringRef CallerContext = CalleeContext.rsplit(" @ ").first; + CallerNameWithContext = CallerContext.rsplit(':').first; + auto ContextSplit = CallerContext.rsplit(" @ "); + FrameLocation LeafFrameLoc = {"", {0, 0}}; + StringRef Funcname; + SampleContext::decodeContextString(ContextSplit.second, Funcname, + LeafFrameLoc.second); + LeafFrameLoc.first = Funcname.str(); + return LeafFrameLoc; +} + +void CSProfileGenerator::populateInferredFunctionSamples() { + for (const auto &Item : ProfileMap) { + const StringRef CalleeContext = Item.first(); + const FunctionSamples &CalleeProfile = Item.second; + + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. 
+ if (CalleeProfile.getHeadSamples()) + continue; + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. + if (CalleeContext.find(" @ ") == StringRef::npos) + continue; + + // Infer Caller's frame loc and context ID through string splitting + StringRef CallerContextId; + FrameLocation &&CallerLeafFrameLoc = + getCallerContext(CalleeContext, CallerContextId); + + // It's possible that we haven't seen any sample directly in the caller, + // in which case CallerProfile will not exist. But we can't modify + // ProfileMap while iterating it. + // TODO: created function profile for those callers too + if (ProfileMap.find(CallerContextId) == ProfileMap.end()) + continue; + FunctionSamples &CallerProfile = ProfileMap[CallerContextId]; + + // Since we don't have call count for inlined functions, we + // estimate it from inlinee's profile using entry body sample. + uint64_t EstimatedCallCount = CalleeProfile.getEntrySamples(); + // If we don't have samples with location, use 1 to indicate live. + if (!EstimatedCallCount && !CalleeProfile.getBodySamples().size()) + EstimatedCallCount = 1; + CallerProfile.addCalledTargetSamples( + CallerLeafFrameLoc.second.LineOffset, + CallerLeafFrameLoc.second.Discriminator, CalleeProfile.getName(), + EstimatedCallCount); + updateBodySamplesforFunctionProfile(CallerProfile, CallerLeafFrameLoc, + EstimatedCallCount); + } +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -24,13 +24,18 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/Path.h" +#include #include +#include #include #include #include #include +using namespace llvm; +using namespace sampleprof; using namespace llvm::object; namespace llvm { @@ -40,14 +45,50 @@ struct InstructionPointer { ProfiledBinary *Binary; - // Offset to the base address of the executable segment of the binary. - uint64_t Offset; + union { + // Offset of the executable segment of the binary. + uint64_t Offset = 0; + // Also used as address in unwinder + uint64_t Address; + }; // Index to the sorted code address array of the binary. - uint64_t Index; + uint64_t Index = 0; + InstructionPointer(ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext = false); + void advance(); + void backward(); + void update(uint64_t Addr); +}; + +// PrologEpilog offset tracker, used to filter out broken stack samples +// Currently we use a heuristic size (two) to infer prolog and epilog +// based on the start address and return address. In the future, +// we will switch to Dwarf CFI based tracker +struct PrologEpilogTracker { + // A set of prolog and epilog offsets. Used by virtual unwinding. 
+ std::unordered_set PrologEpilogSet; + ProfiledBinary *Binary; + PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){}; + + // Take the two addresses from the start of function as prolog + void inferPrologOffsets( + std::unordered_map &FuncStartAddrMap) { + for (auto I : FuncStartAddrMap) { + PrologEpilogSet.insert(I.first); + InstructionPointer IP(Binary, I.first); + IP.advance(); + PrologEpilogSet.insert(IP.Offset); + } + } - InstructionPointer(ProfiledBinary *Binary, uint64_t Offset) - : Binary(Binary), Offset(Offset) { - Index = 0; + // Take the last two addresses before the return address as epilog + void inferEpilogOffsets(std::unordered_set &RetAddrs) { + for (auto Addr : RetAddrs) { + PrologEpilogSet.insert(Addr); + InstructionPointer IP(Binary, Addr); + IP.backward(); + PrologEpilogSet.insert(IP.Offset); + } } }; @@ -67,12 +108,14 @@ std::unique_ptr MII; std::unique_ptr DisAsm; std::unique_ptr MIA; - std::unique_ptr IP; + std::unique_ptr IPrinter; // A list of text sections sorted by start RVA and size. Used to check // if a given RVA is a valid code address. std::set> TextSections; // Function offset to name mapping. std::unordered_map FuncStartAddrMap; + // Offset to context location map. Used to expand the context. + std::unordered_map Offset2LocStackMap; // An array of offsets of all instructions sorted in increasing order. The // sorting is needed to fast advance to the next forward/backward instruction. std::vector CodeAddrs; @@ -81,9 +124,10 @@ // A set of return instruction offsets. Used by virtual unwinding. std::unordered_set RetAddrs; + PrologEpilogTracker ProEpilogTracker; + // The symbolizer used to get inline context for an instruction. std::unique_ptr Symbolizer; - void setPreferredBaseAddress(const ELFObjectFileBase *O); // Set up disassembler and related components. @@ -97,7 +141,8 @@ bool dissassembleSymbol(std::size_t SI, ArrayRef Bytes, SectionSymbolsTy &Symbols, const SectionRef &Section); /// Symbolize a given instruction pointer and return a full call context. - FrameLocationStack symbolize(const InstructionPointer &I); + FrameLocationStack symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName = false); /// Decode the interesting parts of the binary and build internal data /// structures. On high level, the parts of interest are: @@ -107,17 +152,81 @@ /// 3. Pseudo probe related sections, used by probe-based profile /// generation. 
void load(); + const FrameLocationStack &getFrameLocationStack(uint64_t Offset) const { + auto I = Offset2LocStackMap.find(Offset); + assert(I != Offset2LocStackMap.end() && + "Can't find location for offset in the binary"); + return I->second; + } public: - ProfiledBinary(StringRef Path) : Path(Path) { + ProfiledBinary(StringRef Path) : Path(Path), ProEpilogTracker(this) { setupSymbolizer(); load(); } - + uint64_t virtualAddrToOffset(uint64_t VitualAddress) const { + return VitualAddress - BaseAddress; + } + uint64_t offsetToVirtualAddr(uint64_t Offset) const { + return Offset + BaseAddress; + } const StringRef getPath() const { return Path; } const StringRef getName() const { return llvm::sys::path::filename(Path); } uint64_t getBaseAddress() const { return BaseAddress; } void setBaseAddress(uint64_t Address) { BaseAddress = Address; } + uint64_t getPreferredBaseAddress() const { return PreferredBaseAddress; } + + bool addressIsCode(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return Offset2LocStackMap.find(Offset) != Offset2LocStackMap.end(); + } + bool addressIsCall(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return CallAddrs.count(Offset); + } + bool addressIsReturn(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return RetAddrs.count(Offset); + } + bool addressInPrologEpilog(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return ProEpilogTracker.PrologEpilogSet.count(Offset); + } + + uint64_t getAddressforIndex(uint64_t Index) const { + return offsetToVirtualAddr(CodeAddrs[Index]); + } + + // Get the index in CodeAddrs for the address + // As we might get an address which is not the code + // here it would round to the next valid code address by + // using lower bound operation + uint32_t getIndexForAddr(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + auto Low = std::lower_bound(CodeAddrs.begin(), CodeAddrs.end(), Offset); + return Low - CodeAddrs.begin(); + } + + uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + return getAddressforIndex(getIndexForAddr(FrameAddr) - 1); + } + + StringRef getFuncFromStartOffset(uint64_t Offset) { + return FuncStartAddrMap[Offset]; + } + + const FrameLocation &getInlineLeafFrameLoc(uint64_t Offset, + bool NameOnly = false) { + return getFrameLocationStack(Offset).back(); + } + + // Compare two addresses' inline context + bool inlineContextEqual(uint64_t Add1, uint64_t Add2) const; + + // Get the context string of the current stack with inline context filled in. + // It will search the disassembling info stored in Offset2LocStackMap. This is + // used as the key of function sample map + std::string getExpandedContextStr(const std::list &stack) const; }; } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -18,6 +18,7 @@ #define DEBUG_TYPE "load-binary" using namespace llvm; +using namespace sampleprof; static cl::opt ShowDisassembly("show-disassembly", cl::ReallyHidden, cl::init(false), cl::ZeroOrMore, @@ -95,11 +96,63 @@ // Disassemble the text sections. disassemble(Obj); + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologOffsets(FuncStartAddrMap); + ProEpilogTracker.inferEpilogOffsets(RetAddrs); + // TODO: decode other sections. 
return; } +bool ProfiledBinary::inlineContextEqual(uint64_t Address1, + uint64_t Address2) const { + uint64_t Offset1 = virtualAddrToOffset(Address1); + uint64_t Offset2 = virtualAddrToOffset(Address2); + const FrameLocationStack &Context1 = getFrameLocationStack(Offset1); + const FrameLocationStack &Context2 = getFrameLocationStack(Offset2); + if (Context1.size() != Context2.size()) + return false; + + // The leaf frame contains location within the leaf, and it + // needs to be remove that as it's not part of the calling context + return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1, + Context2.begin(), Context2.begin() + Context2.size() - 1); +} + +std::string +ProfiledBinary::getExpandedContextStr(const std::list &Stack) const { + std::string ContextStr; + SmallVector ContextVec; + // Process from frame root to leaf + for (auto Iter = Stack.rbegin(); Iter != Stack.rend(); Iter++) { + uint64_t Offset = virtualAddrToOffset(*Iter); + const FrameLocationStack &ExpandedContext = getFrameLocationStack(Offset); + for (const auto &Loc : ExpandedContext) { + ContextVec.push_back(getCallSite(Loc)); + } + } + + assert(ContextVec.size() && "Context length should be at least 1"); + + std::ostringstream OContextStr; + for (uint32_t I = 0; I < (uint32_t)ContextVec.size(); I++) { + if (OContextStr.str().size()) { + OContextStr << " @ "; + } + + if (I == ContextVec.size() - 1) { + // Only keep the function name for the leaf frame + StringRef Ref(ContextVec[I]); + OContextStr << Ref.split(":").first.str(); + } else { + OContextStr << ContextVec[I]; + } + } + + return OContextStr.str(); +} + void ProfiledBinary::setPreferredBaseAddress(const ELFObjectFileBase *Obj) { for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); SI != SE; ++SI) { @@ -142,7 +195,7 @@ if (ShowDisassembly) { outs() << format("%8" PRIx64 ":", Offset); size_t Start = outs().tell(); - IP->printInst(&Inst, Offset + Size, "", *STI.get(), outs()); + IPrinter->printInst(&Inst, Offset + Size, "", *STI.get(), outs()); if (ShowSourceLocations) { unsigned Cur = outs().tell() - Start; if (Cur < 40) @@ -155,6 +208,10 @@ const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + // Populate a vector of the symbolized callsite at this location + InstructionPointer IP(this, Offset); + Offset2LocStackMap[Offset] = symbolize(IP, true); + // Populate address maps. 
CodeAddrs.push_back(Offset); if (MCDesc.isCall()) @@ -206,9 +263,9 @@ MIA.reset(TheTarget->createMCInstrAnalysis(MII.get())); int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); - IP.reset(TheTarget->createMCInstPrinter(Triple(TripleName), AsmPrinterVariant, - *AsmInfo, *MII, *MRI)); - IP->setPrintBranchImmAsAddress(true); + IPrinter.reset(TheTarget->createMCInstPrinter( + Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); + IPrinter->setPrintBranchImmAsAddress(true); } void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) { @@ -283,7 +340,8 @@ Symbolizer = std::make_unique(SymbolizerOpts); } -FrameLocationStack ProfiledBinary::symbolize(const InstructionPointer &IP) { +FrameLocationStack ProfiledBinary::symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName) { assert(this == IP.Binary && "Binary should only symbolize its own instruction"); auto Addr = object::SectionedAddress{IP.Offset + PreferredBaseAddress, @@ -297,14 +355,43 @@ const auto &CallerFrame = InlineStack.getFrame(I); if (CallerFrame.FunctionName == "") break; + StringRef FunctionName(CallerFrame.FunctionName); + if (UseCanonicalFnName) + FunctionName = FunctionSamples::getCanonicalFnName(FunctionName); LineLocation Line(CallerFrame.Line - CallerFrame.StartLine, CallerFrame.Discriminator); - FrameLocation Callsite(CallerFrame.FunctionName, Line); + FrameLocation Callsite(FunctionName.str(), Line); CallStack.push_back(Callsite); } return CallStack; } +InstructionPointer::InstructionPointer(ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext) + : Binary(Binary), Address(Address) { + Index = Binary->getIndexForAddr(Address); + if (RoundToNext) { + // we might get address which is not the code + // it should round to the next valid address + this->Address = Binary->getAddressforIndex(Index); + } +} + +void InstructionPointer::advance() { + Index++; + Address = Binary->getAddressforIndex(Index); +} + +void InstructionPointer::backward() { + Index--; + Address = Binary->getAddressforIndex(Index); +} + +void InstructionPointer::update(uint64_t Addr) { + Address = Addr; + Index = Binary->getIndexForAddr(Address); +} + } // end namespace sampleprof } // end namespace llvm diff --git a/llvm/tools/llvm-profgen/llvm-profgen.cpp b/llvm/tools/llvm-profgen/llvm-profgen.cpp --- a/llvm/tools/llvm-profgen/llvm-profgen.cpp +++ b/llvm/tools/llvm-profgen/llvm-profgen.cpp @@ -1,4 +1,4 @@ -//===- llvm-profgen.cpp - LLVM SPGO profile generation tool ---------------===// +//===- llvm-profgen.cpp - LLVM SPGO profile generation tool -----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,6 +12,7 @@ #include "ErrorHandling.h" #include "PerfReader.h" +#include "ProfileGenerator.h" #include "ProfiledBinary.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" @@ -28,10 +29,6 @@ llvm::cl::MiscFlags::CommaSeparated, cl::desc("Path of profiled binary files")); -static cl::opt OutputFilename("output", cl::value_desc("output"), - cl::Required, - cl::desc("Output profile file")); - using namespace llvm; using namespace sampleprof; @@ -49,5 +46,10 @@ PerfReader Reader(BinaryFilenames); Reader.parsePerfTraces(PerfTraceFilenames); + std::unique_ptr Generator = ProfileGenerator::create( + Reader.getBinarySampleCounters(), Reader.getPerfScriptType()); + Generator->generateProfile(); + Generator->write(); + return EXIT_SUCCESS; }
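As a standalone illustration of the disjoint-range computation documented in ProfileGenerator::findDisjointRanges above, the following minimal sketch (plain std types, hypothetical addresses and counts; not code from this patch) reproduces the worked example from the comment: two overlapping ranges with counts 100 and 200 collapse into the disjoint ranges [A,B]: 300 and [B+1,C]: 200.

// Standalone sketch of the boundary-point algorithm described in
// ProfileGenerator::findDisjointRanges. Simplified types and illustrative
// values only; the tool itself works on binary offsets and context strings.
#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

using Range = std::pair<uint64_t, uint64_t>; // [begin, end], inclusive
using RangeCounts = std::map<Range, uint64_t>;

static RangeCounts findDisjointRanges(const RangeCounts &Ranges) {
  struct Boundary { uint64_t Begin = 0, End = 0; };
  // Accumulate, per address, the counts of samples beginning and ending there.
  std::map<uint64_t, Boundary> Boundaries;
  for (const auto &R : Ranges) {
    Boundaries[R.first.first].Begin += R.second;
    Boundaries[R.first.second].End += R.second;
  }

  // Sweep the boundary points in address order, keeping a running count of
  // the currently open ranges and emitting a disjoint range at each boundary.
  RangeCounts Disjoint;
  uint64_t BeginAddress = 0, Count = 0;
  for (const auto &Item : Boundaries) {
    uint64_t Address = Item.first;
    const Boundary &Point = Item.second;
    if (Point.Begin) {
      if (BeginAddress)
        Disjoint[{BeginAddress, Address - 1}] = Count;
      Count += Point.Begin;
      BeginAddress = Address;
    }
    if (Point.End) {
      Disjoint[{BeginAddress, Address}] = Count;
      Count -= Point.End;
      BeginAddress = Address + 1;
    }
  }
  return Disjoint;
}

int main() {
  // |<--100-->|            Sample1: [0x10, 0x20] seen 100 times
  // |<------200------>|    Sample2: [0x10, 0x30] seen 200 times
  RangeCounts Ranges = {{{0x10, 0x20}, 100}, {{0x10, 0x30}, 200}};
  RangeCounts Disjoint = findDisjointRanges(Ranges);
  assert((Disjoint[{0x10, 0x20}] == 300)); // [A, B] is covered by both samples
  assert((Disjoint[{0x21, 0x30}] == 200)); // [B+1, C] only by Sample2
  return 0;
}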