diff --git a/lldb/include/lldb/Target/Trace.h b/lldb/include/lldb/Target/Trace.h
--- a/lldb/include/lldb/Target/Trace.h
+++ b/lldb/include/lldb/Target/Trace.h
@@ -240,6 +240,9 @@
 
   using OnBinaryDataReadCallback =
       std::function<llvm::Error(llvm::ArrayRef<uint8_t> data)>;
+  using OnCoresBinaryDataReadCallback = std::function<llvm::Error(
+      const llvm::DenseMap<lldb::core_id_t, llvm::ArrayRef<uint8_t>>
+          &core_to_data)>;
 
   /// Fetch binary data associated with a thread, either live or postmortem, and
   /// pass it to the given callback. The reason of having a callback is to free
@@ -292,6 +295,12 @@
                                    llvm::StringRef kind,
                                    OnBinaryDataReadCallback callback);
 
+  /// Similar to \a OnCoreBinaryDataRead but this is able to fetch the same data
+  /// from multiple cores at once.
+  llvm::Error OnCoresBinaryDataRead(const std::set<lldb::core_id_t> core_ids,
+                                    llvm::StringRef kind,
+                                    OnCoresBinaryDataReadCallback callback);
+
   /// \return
   ///     All the currently traced processes.
   std::vector<Process *> GetAllProcesses();
@@ -518,7 +527,12 @@
 
   /// core id -> data kind -> size
   llvm::DenseMap<lldb::core_id_t, std::unordered_map<std::string, uint64_t>>
+      m_live_core_data_sizes;
+  /// core id -> data kind -> bytes
+  llvm::DenseMap<lldb::core_id_t,
+                 std::unordered_map<std::string, std::vector<uint8_t>>>
       m_live_core_data;
+
   /// data kind -> size
   std::unordered_map<std::string, uint64_t> m_live_process_data;
   /// \}
diff --git a/lldb/include/lldb/Utility/TraceIntelPTGDBRemotePackets.h b/lldb/include/lldb/Utility/TraceIntelPTGDBRemotePackets.h
--- a/lldb/include/lldb/Utility/TraceIntelPTGDBRemotePackets.h
+++ b/lldb/include/lldb/Utility/TraceIntelPTGDBRemotePackets.h
@@ -80,9 +80,9 @@
   ///
   /// \return
   ///   Nanosecond wall time.
-  std::chrono::nanoseconds ToNanos(uint64_t tsc) const;
+  uint64_t ToNanos(uint64_t tsc) const;
 
-  uint64_t ToTSC(std::chrono::nanoseconds nanos) const;
+  uint64_t ToTSC(uint64_t nanos) const;
 
   uint32_t time_mult;
   uint16_t time_shift;
diff --git a/lldb/source/Plugins/Trace/intel-pt/CMakeLists.txt b/lldb/source/Plugins/Trace/intel-pt/CMakeLists.txt
--- a/lldb/source/Plugins/Trace/intel-pt/CMakeLists.txt
+++ b/lldb/source/Plugins/Trace/intel-pt/CMakeLists.txt
@@ -18,6 +18,7 @@
   DecodedThread.cpp
   TaskTimer.cpp
   LibiptDecoder.cpp
+  PerfContextSwitchDecoder.cpp
   ThreadDecoder.cpp
   TraceCursorIntelPT.cpp
   TraceIntelPT.cpp
diff --git a/lldb/source/Plugins/Trace/intel-pt/DecodedThread.h b/lldb/source/Plugins/Trace/intel-pt/DecodedThread.h
--- a/lldb/source/Plugins/Trace/intel-pt/DecodedThread.h
+++ b/lldb/source/Plugins/Trace/intel-pt/DecodedThread.h
@@ -243,10 +243,10 @@
 
   lldb::ThreadSP GetThread();
 
-private:
   /// Append a decoding error given an llvm::Error.
   void AppendError(llvm::Error &&error);
 
+private:
   /// Notify this class that the last added instruction or error has
   /// an associated TSC.
   void RecordTscForLastInstruction(uint64_t tsc);
diff --git a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.h b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.h
--- a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.h
+++ b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.h
@@ -9,20 +9,50 @@
 #ifndef LLDB_SOURCE_PLUGINS_TRACE_LIBIPT_DECODER_H
 #define LLDB_SOURCE_PLUGINS_TRACE_LIBIPT_DECODER_H
 
-#include "intel-pt.h"
-
 #include "DecodedThread.h"
+#include "PerfContextSwitchDecoder.h"
 #include "forward-declarations.h"
 
+#include "intel-pt.h"
+
 namespace lldb_private {
 namespace trace_intel_pt {
 
+struct IntelPTThreadSubtrace {
+  uint64_t tsc;
+  uint64_t psb_offset;
+};
+
+/// This struct represents a continuous execution of a thread in a core,
+/// delimited by a context switch in and out, and a list of Intel PT subtraces
+/// that belong to this execution.
+struct IntelPTThreadContinousExecution {
+  ThreadContinuousExecution thread_execution;
+  std::vector<IntelPTThreadSubtrace> intelpt_subtraces;
+
+  IntelPTThreadContinousExecution(
+      const ThreadContinuousExecution &thread_execution)
+      : thread_execution(thread_execution) {}
+
+  /// Comparator by time
+  bool operator<(const IntelPTThreadContinousExecution &o) const;
+};
+
 /// Decode a raw Intel PT trace given in \p buffer and append the decoded
 /// instructions and errors in \p decoded_thread. It uses the low level libipt
 /// library underneath.
 void DecodeTrace(DecodedThread &decoded_thread, TraceIntelPT &trace_intel_pt,
                  llvm::ArrayRef<uint8_t> buffer);
 
+void DecodeTrace(
+    DecodedThread &decoded_thread, TraceIntelPT &trace_intel_pt,
+    const llvm::DenseMap<lldb::core_id_t, llvm::ArrayRef<uint8_t>> &buffers,
+    const std::vector<IntelPTThreadContinousExecution> &executions);
+
+llvm::Expected<std::vector<IntelPTThreadSubtrace>>
+SplitTraceInContinuousExecutions(TraceIntelPT &trace_intel_pt,
+                                 llvm::ArrayRef<uint8_t> buffer);
+
 } // namespace trace_intel_pt
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp
--- a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp
+++ b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp
@@ -50,18 +50,33 @@
 
   /// Decode all the instructions until the end of the trace.
   /// The decoding flow is based on
-  /// https://github.com/intel/libipt/blob/master/doc/howto_libipt.md#the-instruction-flow-decode-loop
-  /// but with some relaxation to allow for gaps in the trace.
+  /// https://github.com/intel/libipt/blob/master/doc/howto_libipt.md#the-instruction-flow-decode-loop.
   void DecodeUntilEndOfTrace() {
-    int status = pte_ok;
-    while (!IsLibiptError(status = FindNextSynchronizationPoint())) {
-      // We have synchronized, so we can start decoding instructions and
-      // events.
-      // Multiple loops indicate gaps in the trace.
+    // Multiple loops indicate gaps in the trace, which are found by the inner
+    // call to DecodeInstructionsAndEvents.
+    while (true) {
+      int status = pt_insn_sync_forward(&m_decoder);
+
+      if (IsLibiptError(status)) {
+        m_decoded_thread.Append(DecodedInstruction(status));
+        break;
+      }
+
       DecodeInstructionsAndEvents(status);
     }
   }
 
+  /// Decode all the instructions that belong to the same PSB packet given its
+  /// offset.
+  void DecodePSB(uint64_t psb_offset) {
+    int status = pt_insn_sync_set(&m_decoder, psb_offset);
+    if (IsLibiptError(status)) {
+      m_decoded_thread.Append(DecodedInstruction(status));
+      return;
+    }
+    DecodeInstructionsAndEvents(status, /*stop_on_psb_change=*/true);
+  }
+
 private:
   /// Invoke the low level function \a pt_insn_next and store the decoded
   /// instruction in the given \a DecodedInstruction.
@@ -75,13 +90,37 @@
     return pt_insn_next(&m_decoder, &insn.pt_insn, sizeof(insn.pt_insn));
   }
 
-  /// Decode all the instructions and events until an error is found or the end
-  /// of the trace is reached.
+  /// Decode all the instructions and events until an error is found, the end
+  /// of the trace is reached, or optionally a new PSB is reached.
   ///
   /// \param[in] status
   ///   The status that was result of synchronizing to the most recent PSB.
-  void DecodeInstructionsAndEvents(int status) {
-    while (DecodedInstruction insn = ProcessPTEvents(status)) {
+  ///
+  /// \param[in] stop_on_psb_change
+  ///   If \b true, decoding
+  ///   An optional offset to a given PSB. Decoding stops if a different PSB is
+  ///   reached.
+  void DecodeInstructionsAndEvents(int status,
+                                   bool stop_on_psb_change = false) {
+    uint64_t psb_offset;
+    pt_insn_get_sync_offset(&m_decoder,
+                            &psb_offset); // this can't fail because we got here
+
+    while (true) {
+      DecodedInstruction insn = ProcessPTEvents(status);
+      if (!insn) {
+        m_decoded_thread.Append(insn);
+        break;
+      }
+
+      if (stop_on_psb_change) {
+        uint64_t cur_psb_offset;
+        pt_insn_get_sync_offset(
+            &m_decoder, &cur_psb_offset); // this can't fail because we got here
+        if (cur_psb_offset != psb_offset)
+          break;
+      }
+
       // The status returned by DecodeNextInstruction will need to be processed
       // by ProcessPTEvents in the next loop if it is not an error.
       if (IsLibiptError(status = DecodeNextInstruction(insn))) {
@@ -112,31 +151,6 @@
     // of trace stream (eos) is reached or pt_insn_sync_forward() passes.
     int status = pt_insn_sync_forward(&m_decoder);
 
-    if (!IsEndOfStream(status) && IsLibiptError(status)) {
-      uint64_t decoder_offset = 0;
-      int errcode_off = pt_insn_get_offset(&m_decoder, &decoder_offset);
-      if (!IsLibiptError(errcode_off)) { // we could get the offset
-        while (true) {
-          status = pt_insn_sync_forward(&m_decoder);
-          if (!IsLibiptError(status) || IsEndOfStream(status))
-            break;
-
-          uint64_t new_decoder_offset = 0;
-          errcode_off = pt_insn_get_offset(&m_decoder, &new_decoder_offset);
-          if (IsLibiptError(errcode_off))
-            break; // We can't further synchronize.
-          else if (new_decoder_offset <= decoder_offset) {
-            // We tried resyncing the decoder and it didn't make any progress
-            // because the offset didn't change. We will not make any further
-            // progress. Hence, we stop in this situation.
-            break;
-          }
-          // We'll try again starting from a new offset.
-          decoder_offset = new_decoder_offset;
-        }
-      }
-    }
-
     // We make this call to record any synchronization errors.
     if (IsLibiptError(status))
       m_decoded_thread.Append(DecodedInstruction(status));
@@ -148,8 +162,6 @@
   /// instruction e.g. timing events like ptev_tick, or paging events like
   /// ptev_paging.
   ///
-  /// If an error is found, it will be appended to the trace.
-  ///
   /// \param[in] status
   ///   The status gotten from the previous instruction decoding or PSB
   ///   synchronization.
@@ -199,8 +211,6 @@
     RefreshTscInfo();
     if (m_tsc_info)
       insn.tsc = m_tsc_info.tsc;
-    if (!insn)
-      m_decoded_thread.Append(insn);
     return insn;
   }
 
@@ -264,8 +274,7 @@
     std::unique_ptr<pt_insn_decoder, decltype(DecoderDeleter)>;
 
 static Expected<PtInsnDecoderUP>
-CreateInstructionDecoder(DecodedThread &decoded_thread,
-                         TraceIntelPT &trace_intel_pt,
+CreateInstructionDecoder(TraceIntelPT &trace_intel_pt,
                          ArrayRef<uint8_t> buffer) {
   Expected<pt_cpu> cpu_info = trace_intel_pt.GetCPUInfo();
   if (!cpu_info)
@@ -287,25 +296,151 @@
   pt_insn_decoder *decoder_ptr = pt_insn_alloc_decoder(&config);
   if (!decoder_ptr)
     return make_error<IntelPTError>(-pte_nomem);
-  PtInsnDecoderUP decoder_up(decoder_ptr, DecoderDeleter);
 
-  pt_image *image = pt_insn_get_image(decoder_ptr);
-  Process *process = decoded_thread.GetThread()->GetProcess().get();
+  return PtInsnDecoderUP(decoder_ptr, DecoderDeleter);
+}
+
+static Error SetupMemoryImage(PtInsnDecoderUP &decoder_up, Process &process) {
+  pt_image *image = pt_insn_get_image(decoder_up.get());
 
+  int status = pte_ok;
   if (IsLibiptError(
-          status = pt_image_set_callback(image, ReadProcessMemory, process)))
+          status = pt_image_set_callback(image, ReadProcessMemory, &process)))
     return make_error<IntelPTError>(status);
-  return decoder_up;
+  return Error::success();
 }
 
 void lldb_private::trace_intel_pt::DecodeTrace(DecodedThread &decoded_thread,
                                                TraceIntelPT &trace_intel_pt,
                                                ArrayRef<uint8_t> buffer) {
   Expected<PtInsnDecoderUP> decoder_up =
-      CreateInstructionDecoder(decoded_thread, trace_intel_pt, buffer);
+      CreateInstructionDecoder(trace_intel_pt, buffer);
   if (!decoder_up)
     return decoded_thread.SetAsFailed(decoder_up.takeError());
 
+  if (Error err = SetupMemoryImage(*decoder_up,
+                                   *decoded_thread.GetThread()->GetProcess()))
+    return decoded_thread.SetAsFailed(std::move(err));
+
   LibiptDecoder libipt_decoder(*decoder_up.get(), decoded_thread);
   libipt_decoder.DecodeUntilEndOfTrace();
 }
+
+void lldb_private::trace_intel_pt::DecodeTrace(
+    DecodedThread &decoded_thread, TraceIntelPT &trace_intel_pt,
+    const DenseMap<lldb::core_id_t, llvm::ArrayRef<uint8_t>> &buffers,
+    const std::vector<IntelPTThreadContinousExecution> &executions) {
+  DenseMap<lldb::core_id_t, LibiptDecoder> decoders;
+  for (auto &core_id_buffer : buffers) {
+    Expected<PtInsnDecoderUP> decoder_up =
+        CreateInstructionDecoder(trace_intel_pt, core_id_buffer.second);
+    if (!decoder_up)
+      return decoded_thread.SetAsFailed(decoder_up.takeError());
+
+    if (Error err = SetupMemoryImage(*decoder_up,
+                                     *decoded_thread.GetThread()->GetProcess()))
+      return decoded_thread.SetAsFailed(std::move(err));
+
+    decoders.try_emplace(core_id_buffer.first,
+                         LibiptDecoder(*decoder_up->release(), decoded_thread));
+  }
+
+  bool has_seen_psbs = false;
+  for (size_t i = 0; i < executions.size(); i++) {
+    const IntelPTThreadContinousExecution &execution = executions[i];
+
+    auto variant = execution.thread_execution.variant;
+    // If we haven't seen a PSB yet, then it's fine not to show errors
+    if (has_seen_psbs) {
+      if (execution.intelpt_subtraces.empty()) {
+        decoded_thread.AppendError(createStringError(
+            inconvertibleErrorCode(),
+            formatv("Unable to find intel pt data for thread execution with "
+                    "tsc = {0} on core id = {1}",
+                    execution.thread_execution.GetLowestKnownTSC(),
+                    execution.thread_execution.core_id)));
+      }
+
+      // If the first execution is incomplete because it doesn't have a previous
+      // context switch in its cpu, all good.
+      if (variant == ThreadContinuousExecution::Variant::OnlyEnd ||
+          variant == ThreadContinuousExecution::Variant::HintedStart) {
+        decoded_thread.AppendError(createStringError(
+            inconvertibleErrorCode(),
+            formatv("Thread execution starting at tsc = {0} on core id = {1} "
+                    "doesn't have a matching context switch in.",
+                    execution.thread_execution.GetLowestKnownTSC(),
+                    execution.thread_execution.core_id)));
+      }
+    }
+
+    LibiptDecoder &decoder =
+        decoders.find(execution.thread_execution.core_id)->second;
+    for (const IntelPTThreadSubtrace &intel_pt_execution :
+         execution.intelpt_subtraces) {
+      has_seen_psbs = true;
+      decoder.DecodePSB(intel_pt_execution.psb_offset);
+    }
+
+    // If we haven't seen a PSB yet, then it's fine not to show errors
+    if (has_seen_psbs) {
+      // If the last execution is incomplete because it doesn't have a following
+      // context switch in its cpu, all good.
+      if ((variant == ThreadContinuousExecution::Variant::OnlyStart &&
+           i + 1 != executions.size()) ||
+          variant == ThreadContinuousExecution::Variant::HintedEnd) {
+        decoded_thread.AppendError(createStringError(
+            inconvertibleErrorCode(),
+            formatv("Thread execution starting at tsc = {0} on core id = {1} "
+                    "doesn't have a matching context switch out",
+                    execution.thread_execution.GetLowestKnownTSC(),
+                    execution.thread_execution.core_id)));
+      }
+    }
+  }
+}
+
+bool IntelPTThreadContinousExecution::operator<(
+    const IntelPTThreadContinousExecution &o) const {
+  // As the context switch might be incomplete, we look first for the first real
+  // PSB packet, which is a valid TSC. Otherwise, We query the thread execution
+  // itself for some tsc.
+  auto get_tsc = [](const IntelPTThreadContinousExecution &exec) {
+    return exec.intelpt_subtraces.empty()
+               ? exec.thread_execution.GetLowestKnownTSC()
+               : exec.intelpt_subtraces.front().tsc;
+  };
+
+  return get_tsc(*this) < get_tsc(o);
+}
+
+Expected<std::vector<IntelPTThreadSubtrace>>
+lldb_private::trace_intel_pt::SplitTraceInContinuousExecutions(
+    TraceIntelPT &trace_intel_pt, llvm::ArrayRef<uint8_t> buffer) {
+  Expected<PtInsnDecoderUP> decoder_up =
+      CreateInstructionDecoder(trace_intel_pt, buffer);
+  if (!decoder_up)
+    return decoder_up.takeError();
+
+  pt_insn_decoder *decoder = decoder_up.get().get();
+
+  std::vector<IntelPTThreadSubtrace> executions;
+
+  int status = pte_ok;
+  while (!IsLibiptError(status = pt_insn_sync_forward(decoder))) {
+    uint64_t tsc;
+    if (IsLibiptError(pt_insn_time(decoder, &tsc, nullptr, nullptr)))
+      return createStringError(inconvertibleErrorCode(),
+                               "intel pt trace doesn't have TSC timestamps");
+
+    uint64_t psb_offset;
+    pt_insn_get_sync_offset(decoder,
+                            &psb_offset); // this can't fail because we got here
+
+    executions.push_back({
+        tsc,
+        psb_offset,
+    });
+  }
+  return executions;
+}
diff --git a/lldb/source/Plugins/Trace/intel-pt/PerfContextSwitchDecoder.h b/lldb/source/Plugins/Trace/intel-pt/PerfContextSwitchDecoder.h
new file mode 100644
--- /dev/null
+++ b/lldb/source/Plugins/Trace/intel-pt/PerfContextSwitchDecoder.h
@@ -0,0 +1,146 @@
+//===-- PerfContextSwitchDecoder.h --======----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_SOURCE_PLUGINS_TRACE_INTEL_PT_PERFCONTEXTSWITCHDECODER_H
+#define LLDB_SOURCE_PLUGINS_TRACE_INTEL_PT_PERFCONTEXTSWITCHDECODER_H
+
+#include "lldb/Utility/TraceIntelPTGDBRemotePackets.h"
+#include "lldb/lldb-types.h"
+
+#include "llvm/Support/Error.h"
+
+#include <vector>
+
+namespace lldb_private {
+namespace trace_intel_pt {
+
+/// This class indicates the time interval in which a thread was running
+/// continuously on a cpu core.
+///
+/// Note: we use the terms CPU and cores interchangeably.
+struct ThreadContinuousExecution {
+
+  /// In most cases both the start and end of a continuous execution can be
+  /// accurately recovered from the context switch trace, but in some cases one
+  /// of these endpoints might be guessed or not known at all, due to contention
+  /// problems in the trace or because tracing was interrupted, e.g. with ioctl
+  /// calls, which causes gaps in the trace. Because of that, we identify which
+  /// situation we fall into with the following variants.
+  enum class Variant {
+    /// Both endpoints are known.
+    Complete,
+    /// The end is known and we have a lower bound for the start, i.e. the
+    /// previous execution in the same core happens strictly before the hinted
+    /// start.
+    HintedStart,
+    /// The start is known and we have an upper bound for the end, i.e. the next
+    /// execution  in the same core happens strictly after the hinted end.
+    HintedEnd,
+    /// We only know the start. This might be the last entry of a core trace.
+    OnlyStart,
+    /// We only know the end. This might be the first entry or a core trace.
+    OnlyEnd,
+  } variant;
+
+  /// \return
+  ///   The lowest tsc that we are sure of, i.e. not hinted.
+  uint64_t GetLowestKnownTSC() const;
+
+  /// \return
+  ///   The known or hinted start tsc, or 0 if the variant is \a OnlyEnd.
+  uint64_t GetStartTSC() const;
+
+  /// \return
+  ///   The known or hinted end tsc, or max \a uint64_t if the variant is \a
+  ///   OnlyStart.
+  uint64_t GetEndTSC() const;
+
+  /// Constructors for the different variants of this object
+  ///
+  /// \{
+  static ThreadContinuousExecution
+  CreateCompleteExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                          lldb::pid_t pid, uint64_t start, uint64_t end);
+
+  static ThreadContinuousExecution
+  CreateHintedStartExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                             lldb::pid_t pid, uint64_t hinted_start,
+                             uint64_t end);
+
+  static ThreadContinuousExecution
+  CreateHintedEndExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                           lldb::pid_t pid, uint64_t start,
+                           uint64_t hinted_end);
+
+  static ThreadContinuousExecution
+  CreateOnlyEndExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                         lldb::pid_t pid, uint64_t end);
+
+  static ThreadContinuousExecution
+  CreateOnlyStartExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                           lldb::pid_t pid, uint64_t start);
+  /// \}
+
+  union {
+    struct {
+      uint64_t start;
+      uint64_t end;
+    } complete;
+    struct {
+      uint64_t start;
+    } only_start;
+    struct {
+      uint64_t end;
+    } only_end;
+    /// The following 'hinted' structures are useful when there are contention
+    /// problems in the trace
+    struct {
+      uint64_t hinted_start;
+      uint64_t end;
+    } hinted_start;
+    struct {
+      uint64_t start;
+      uint64_t hinted_end;
+    } hinted_end;
+  } tscs;
+
+  lldb::core_id_t core_id;
+  lldb::tid_t tid;
+  lldb::pid_t pid;
+
+private:
+  /// We keep this constructor private to force the usage of the static named
+  /// constructors.
+  ThreadContinuousExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                            lldb::pid_t pid)
+      : core_id(core_id), tid(tid), pid(pid) {}
+};
+
+/// Decodes a context switch trace gotten with perf_event_open.
+///
+/// \param[in] data
+///   The context switch trace in binary format.
+///
+/// \param[i] core_id
+///   The core_id where the trace were gotten from.
+///
+/// \param[in] tsc_conversion
+///   The conversion values used to confert nanoseconds to TSC.
+///
+/// \return
+///   A list of continuous executions recovered from the raw trace sorted by
+///   time, or an \a llvm::Error if the data is malformed.
+llvm::Expected<std::vector<ThreadContinuousExecution>>
+DecodePerfContextSwitchTrace(llvm::ArrayRef<uint8_t> data,
+                             lldb::core_id_t core_id,
+                             const LinuxPerfZeroTscConversion &tsc_conversion);
+
+} // namespace trace_intel_pt
+} // namespace lldb_private
+
+#endif // LLDB_SOURCE_PLUGINS_TRACE_INTEL_PT_PERFCONTEXTSWITCHDECODER_H
diff --git a/lldb/source/Plugins/Trace/intel-pt/PerfContextSwitchDecoder.cpp b/lldb/source/Plugins/Trace/intel-pt/PerfContextSwitchDecoder.cpp
new file mode 100644
--- /dev/null
+++ b/lldb/source/Plugins/Trace/intel-pt/PerfContextSwitchDecoder.cpp
@@ -0,0 +1,281 @@
+//===-- PerfContextSwitchDecoder.cpp --======------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PerfContextSwitchDecoder.h"
+
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::trace_intel_pt;
+using namespace llvm;
+
+/// Copied from <linux/perf_event.h> to avoid depending on perf_event.h on
+/// non-linux platforms.
+/// \{
+struct perf_event_header {
+  uint32_t type;
+  uint16_t misc;
+  uint16_t size;
+};
+
+#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13)
+#define PERF_RECORD_MAX 19
+#define PERF_RECORD_SWITCH_CPU_WIDE 15
+/// \}
+
+/// Record found in the perf_event context switch traces. It might contain
+/// additional fields in memory, but header.size should have the actual size
+/// of the record.
+struct PerfContextSwitchRecord {
+  struct perf_event_header header;
+  uint32_t next_prev_pid;
+  uint32_t next_prev_tid;
+  uint32_t pid, tid;
+  uint64_t time_in_nanos;
+
+  bool IsOut() const { return header.misc & PERF_RECORD_MISC_SWITCH_OUT; }
+
+  bool IsContextSwitchRecord() const {
+    return header.type == PERF_RECORD_SWITCH_CPU_WIDE;
+  }
+
+  /// \return
+  ///   An \a llvm::Error if the record looks obviously wrong, or \a
+  ///   llvm::Error::success() otherwise.
+  Error SanityCheck() const {
+    // A record of too many uint64_t's or more should mean that the data is
+    // wrong
+    if (header.size == 0 || header.size > sizeof(uint64_t) * 1000)
+      return createStringError(
+          inconvertibleErrorCode(),
+          formatv("A record of {0} bytes was found.", header.size));
+
+    // We add some numbers to PERF_RECORD_MAX because some systems might have
+    // custom records. In any case, we are looking only for abnormal data.
+    if (header.type >= PERF_RECORD_MAX + 100)
+      return createStringError(
+          inconvertibleErrorCode(),
+          formatv("Invalid record type {0} was found.", header.type));
+    return Error::success();
+  }
+};
+
+/// Record produced after parsing the raw context switch trace produce by
+/// perf_event. A major difference between this struct and
+/// PerfContextSwitchRecord is that this one uses tsc instead of nanos.
+struct ContextSwitchRecord {
+  uint64_t tsc;
+  /// Whether the switch is in or out
+  bool is_out;
+  /// pid = 0 and tid = 0 indicate the swapper or idle process, which normally
+  /// runs after a context switch out of a normal user thread.
+  lldb::pid_t pid;
+  lldb::tid_t tid;
+
+  bool IsOut() const { return is_out; }
+
+  bool IsIn() const { return !is_out; }
+};
+
+uint64_t ThreadContinuousExecution::GetLowestKnownTSC() const {
+  switch (variant) {
+  case Variant::Complete:
+    return tscs.complete.start;
+  case Variant::OnlyStart:
+    return tscs.only_start.start;
+  case Variant::OnlyEnd:
+    return tscs.only_end.end;
+  case Variant::HintedEnd:
+    return tscs.hinted_end.start;
+  case Variant::HintedStart:
+    return tscs.hinted_start.end;
+  }
+}
+
+uint64_t ThreadContinuousExecution::GetStartTSC() const {
+  switch (variant) {
+  case Variant::Complete:
+    return tscs.complete.start;
+  case Variant::OnlyStart:
+    return tscs.only_start.start;
+  case Variant::OnlyEnd:
+    return 0;
+  case Variant::HintedEnd:
+    return tscs.hinted_end.start;
+  case Variant::HintedStart:
+    return tscs.hinted_start.hinted_start;
+  }
+}
+
+uint64_t ThreadContinuousExecution::GetEndTSC() const {
+  switch (variant) {
+  case Variant::Complete:
+    return tscs.complete.end;
+  case Variant::OnlyStart:
+    return std::numeric_limits<uint64_t>::max();
+  case Variant::OnlyEnd:
+    return tscs.only_end.end;
+  case Variant::HintedEnd:
+    return tscs.hinted_end.hinted_end;
+  case Variant::HintedStart:
+    return tscs.hinted_start.end;
+  }
+}
+
+ThreadContinuousExecution ThreadContinuousExecution::CreateCompleteExecution(
+    lldb::core_id_t core_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start,
+    uint64_t end) {
+  ThreadContinuousExecution o(core_id, tid, pid);
+  o.variant = Variant::Complete;
+  o.tscs.complete.start = start;
+  o.tscs.complete.end = end;
+  return o;
+}
+
+ThreadContinuousExecution ThreadContinuousExecution::CreateHintedStartExecution(
+    lldb::core_id_t core_id, lldb::tid_t tid, lldb::pid_t pid,
+    uint64_t hinted_start, uint64_t end) {
+  ThreadContinuousExecution o(core_id, tid, pid);
+  o.variant = Variant::HintedStart;
+  o.tscs.hinted_start.hinted_start = hinted_start;
+  o.tscs.hinted_start.end = end;
+  return o;
+}
+
+ThreadContinuousExecution ThreadContinuousExecution::CreateHintedEndExecution(
+    lldb::core_id_t core_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start,
+    uint64_t hinted_end) {
+  ThreadContinuousExecution o(core_id, tid, pid);
+  o.variant = Variant::HintedEnd;
+  o.tscs.hinted_end.start = start;
+  o.tscs.hinted_end.hinted_end = hinted_end;
+  return o;
+}
+
+ThreadContinuousExecution ThreadContinuousExecution::CreateOnlyEndExecution(
+    lldb::core_id_t core_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t end) {
+  ThreadContinuousExecution o(core_id, tid, pid);
+  o.variant = Variant::OnlyEnd;
+  o.tscs.only_end.end = end;
+  return o;
+}
+
+ThreadContinuousExecution ThreadContinuousExecution::CreateOnlyStartExecution(
+    lldb::core_id_t core_id, lldb::tid_t tid, lldb::pid_t pid, uint64_t start) {
+  ThreadContinuousExecution o(core_id, tid, pid);
+  o.variant = Variant::OnlyStart;
+  o.tscs.only_start.start = start;
+  return o;
+}
+
+static Error RecoverExecutionsFromConsecutiveRecords(
+    core_id_t core_id, const LinuxPerfZeroTscConversion &tsc_conversion,
+    const ContextSwitchRecord &current_record,
+    const Optional<ContextSwitchRecord> &prev_record,
+    std::function<void(const ThreadContinuousExecution &execution)>
+        on_new_execution) {
+  if (!prev_record) {
+    if (current_record.IsOut()) {
+      on_new_execution(ThreadContinuousExecution::CreateOnlyEndExecution(
+          core_id, current_record.tid, current_record.pid, current_record.tsc));
+    }
+    // The 'in' case will be handled later when we try to look for its end
+    return Error::success();
+  }
+
+  const ContextSwitchRecord &prev = *prev_record;
+  if (prev.tsc >= current_record.tsc)
+    return createStringError(
+        inconvertibleErrorCode(),
+        formatv("A context switch record doesn't happen after the previous "
+                "record. Previous TSC= {0}, current TSC = {1}.",
+                prev.tsc, current_record.tsc));
+
+  if (current_record.IsIn() && prev.IsIn()) {
+    // We found two consecutive ins, which means that we didn't capture
+    // the end of the previous execution.
+    on_new_execution(ThreadContinuousExecution::CreateHintedEndExecution(
+        core_id, prev.tid, prev.pid, prev.tsc, current_record.tsc - 1));
+  } else if (current_record.IsOut() && prev.IsOut()) {
+    // We found two consecutive outs, that means that we didn't capture
+    // the beginning of the current execution.
+    on_new_execution(ThreadContinuousExecution::CreateHintedStartExecution(
+        core_id, current_record.tid, current_record.pid, prev.tsc + 1,
+        current_record.tsc));
+  } else if (current_record.IsOut() && prev.IsIn()) {
+    if (current_record.pid == prev.pid && current_record.tid == prev.tid) {
+      /// A complete execution
+      on_new_execution(ThreadContinuousExecution::CreateCompleteExecution(
+          core_id, current_record.tid, current_record.pid, prev.tsc,
+          current_record.tsc));
+    } else {
+      // An out after the in of a different thread. The first one doesn't
+      // have an end, and the second one doesn't have a start.
+      on_new_execution(ThreadContinuousExecution::CreateHintedEndExecution(
+          core_id, prev.tid, prev.pid, prev.tsc, current_record.tsc - 1));
+      on_new_execution(ThreadContinuousExecution::CreateHintedStartExecution(
+          core_id, current_record.tid, current_record.pid, prev.tsc + 1,
+          current_record.tsc));
+    }
+  }
+  return Error::success();
+}
+
+#include <fstream>
+
+Expected<std::vector<ThreadContinuousExecution>>
+lldb_private::trace_intel_pt::DecodePerfContextSwitchTrace(
+    ArrayRef<uint8_t> data, core_id_t core_id,
+    const LinuxPerfZeroTscConversion &tsc_conversion) {
+
+  std::vector<ThreadContinuousExecution> executions;
+
+  // This offset is used to create the error message in case of failures.
+  size_t offset = 0;
+
+  auto do_decode = [&]() -> Error {
+    Optional<ContextSwitchRecord> prev_record;
+    while (offset < data.size()) {
+      const PerfContextSwitchRecord &perf_record =
+          *reinterpret_cast<const PerfContextSwitchRecord *>(data.data() +
+                                                             offset);
+      if (Error err = perf_record.SanityCheck())
+        return err;
+
+      if (perf_record.IsContextSwitchRecord()) {
+        ContextSwitchRecord record{
+            tsc_conversion.ToTSC(perf_record.time_in_nanos),
+            perf_record.IsOut(), static_cast<lldb::pid_t>(perf_record.pid),
+            static_cast<lldb::tid_t>(perf_record.tid)};
+
+        if (Error err = RecoverExecutionsFromConsecutiveRecords(
+                core_id, tsc_conversion, record, prev_record,
+                [&](const ThreadContinuousExecution &execution) {
+                  executions.push_back(execution);
+                }))
+          return err;
+
+        prev_record = record;
+      }
+      offset += perf_record.header.size;
+    }
+
+    // We might have an incomplete last record
+    if (prev_record && prev_record->IsIn())
+      executions.push_back(ThreadContinuousExecution::CreateOnlyStartExecution(
+          core_id, prev_record->tid, prev_record->pid, prev_record->tsc));
+    return Error::success();
+  };
+
+  if (Error err = do_decode())
+    return createStringError(inconvertibleErrorCode(),
+                             formatv("Malformed perf context switch trace for "
+                                     "cpu {0} at offset {1}. {2}",
+                                     core_id, offset,
+                                     toString(std::move(err))));
+
+  return executions;
+}
diff --git a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTJSONStructs.cpp b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTJSONStructs.cpp
--- a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTJSONStructs.cpp
+++ b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTJSONStructs.cpp
@@ -49,8 +49,10 @@
 }
 
 json::Value toJSON(const JSONThread &thread) {
-  return json::Object{{"tid", thread.tid},
-                      {"traceBuffer", thread.trace_buffer}};
+  json::Object obj{{"tid", thread.tid}};
+  if (thread.trace_buffer)
+    obj["traceBuffer"] = *thread.trace_buffer;
+  return obj;
 }
 
 bool fromJSON(const json::Value &value, JSONThread &thread, Path path) {
diff --git a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.h b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.h
--- a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.h
+++ b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.h
@@ -9,108 +9,27 @@
 #ifndef LLDB_SOURCE_PLUGINS_TRACE_INTEL_PT_TRACEINTELPTMULTICOREDECODER_H
 #define LLDB_SOURCE_PLUGINS_TRACE_INTEL_PT_TRACEINTELPTMULTICOREDECODER_H
 
+#include "LibiptDecoder.h"
+#include "PerfContextSwitchDecoder.h"
 #include "ThreadDecoder.h"
 
 namespace lldb_private {
 namespace trace_intel_pt {
 
-/// This class indicates the time interval in which a thread was running
-/// continuously on a cpu core.
-///
-/// In most cases both endpoints of the intervals can be accurately recovered
-/// from a context switch trace, but in some cases one of these endpoints might
-/// be guessed or not known at all, due to contention problems in the trace or
-/// because tracing was interrupted.
-///
-/// Note: we use the terms CPU and cores interchangeably.
-struct ThreadContinuousExecution {
-  enum class Variant {
-    /// Both endpoints are known
-    Complete,
-    /// The end is known and we have a guess for the start
-    HintedStart,
-    /// The start is known and we have a guess for the end
-    HintedEnd,
-    /// We only know the start. This might be the last entry of a core trace.
-    OnlyStart,
-    /// We only know the end. This might be the first entry or a core trace.
-    OnlyEnd,
-  } variant;
-
-  union {
-    struct {
-      uint64_t start;
-      uint64_t end;
-    } complete;
-    struct {
-      uint64_t start;
-    } only_start;
-    struct {
-      uint64_t end;
-    } only_end;
-    /// The following 'hinted' structures are useful when there are contention
-    /// problems in the trace
-    struct {
-      uint64_t hinted_start;
-      uint64_t end;
-    } hinted_start;
-    struct {
-      uint64_t start;
-      uint64_t hinted_end;
-    } hinted_end;
-  } tscs;
-
-  lldb::core_id_t core_id;
-  lldb::tid_t tid;
-
-  /// \return
-  ///   A tsc that we are certain of, either the start or the end.
-  uint64_t GetErrorFreeTSC() const;
-
-  /// Constructors for the different variants of this object
-  ///
-  /// \{
-  static ThreadContinuousExecution
-  CreateCompleteExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                          uint64_t start, uint64_t end);
-
-  static ThreadContinuousExecution
-  CreateHintedStartExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                             uint64_t hinted_start, uint64_t end);
-
-  static ThreadContinuousExecution
-  CreateHintedEndExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                           uint64_t start, uint64_t hinted_end);
-
-  static ThreadContinuousExecution
-  CreateOnlyEndExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                         uint64_t end);
-
-  static ThreadContinuousExecution
-  CreateOnlyStartExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                           uint64_t start);
-  /// \}
-
-  /// Comparator by TSCs
-  bool operator<(const ThreadContinuousExecution &o) const;
-
-private:
-  ThreadContinuousExecution(lldb::core_id_t core_id, lldb::tid_t tid)
-      : core_id(core_id), tid(tid) {}
-};
-
 /// Class used to decode a multi-core Intel PT trace. It assumes that each
 /// thread could have potentially been executed on different cores. It uses a
 /// context switch trace per CPU with timestamps to identify which thread owns
 /// each Intel PT decoded instruction and in which order. It also assumes that
 /// the Intel PT data and context switches might have gaps in their traces due
-/// to contention or race conditions.
+/// to contention or race conditions. Finally, it assumes that a tid is not
+/// repeated twice for two different threads because of the shortness of the
+/// intel pt trace.
 class TraceIntelPTMultiCoreDecoder {
 public:
   /// \param[in] core_ids
   ///   The list of cores where the traced programs were running on.
   ///
-  /// \param[in] tid
+  /// \param[in] tids
   ///   The full list of tids that were traced.
   ///
   /// \param[in] tsc_conversion
@@ -143,16 +62,24 @@
   /// by thread.
   llvm::Error DecodeContextSwitchTraces();
 
+  /// Produce a mapping from thread ids to the list of continuos executions with
+  /// their associated intel pt subtraces.
+  llvm::Expected<
+      llvm::DenseMap<lldb::tid_t, std::vector<IntelPTThreadContinousExecution>>>
+  CorrelateContextSwitchesAndIntelPtTraces();
+
   TraceIntelPT &m_trace;
   std::set<lldb::core_id_t> m_cores;
   std::set<lldb::tid_t> m_tids;
   llvm::Optional<
-      llvm::DenseMap<lldb::tid_t, std::vector<ThreadContinuousExecution>>>
+      llvm::DenseMap<lldb::tid_t, std::vector<IntelPTThreadContinousExecution>>>
       m_continuous_executions_per_thread;
+  llvm::DenseMap<lldb::tid_t, DecodedThreadSP> m_decoded_threads;
   LinuxPerfZeroTscConversion m_tsc_conversion;
   /// This variable will be non-None if a severe error happened during the setup
   /// of the decoder.
   llvm::Optional<std::string> m_setup_error;
+  uint64_t m_unattributed_intelpt_subtraces;
 };
 
 } // namespace trace_intel_pt
diff --git a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.cpp b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.cpp
--- a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.cpp
+++ b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.cpp
@@ -12,225 +12,11 @@
 
 #include "llvm/Support/Error.h"
 
-#include <linux/perf_event.h>
-
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::trace_intel_pt;
 using namespace llvm;
 
-struct PerfContextSwitchRecord {
-  struct perf_event_header header;
-  uint32_t next_prev_pid;
-  uint32_t next_prev_tid;
-  uint32_t pid, tid;
-  uint64_t time_in_nanos;
-
-  bool IsOut() const { return header.misc & PERF_RECORD_MISC_SWITCH_OUT; }
-};
-
-struct ContextSwitchRecord {
-  uint64_t tsc;
-  bool is_out;
-  /// A pid of 0 indicates an execution in the kernel
-  lldb::pid_t pid;
-  lldb::tid_t tid;
-
-  bool IsOut() const { return is_out; }
-
-  bool IsIn() const { return !is_out; }
-};
-
-uint64_t ThreadContinuousExecution::GetErrorFreeTSC() const {
-  switch (variant) {
-  case Variant::Complete:
-    return tscs.complete.start; // end would also work
-  case Variant::HintedStart:
-    return tscs.hinted_start.end;
-  case Variant::HintedEnd:
-    return tscs.hinted_end.start;
-  case Variant::OnlyEnd:
-    return tscs.only_end.end;
-  case Variant::OnlyStart:
-    return tscs.only_start.start;
-  }
-}
-
-ThreadContinuousExecution ThreadContinuousExecution::CreateCompleteExecution(
-    lldb::core_id_t core_id, lldb::tid_t tid, uint64_t start, uint64_t end) {
-  ThreadContinuousExecution o(core_id, tid);
-  o.variant = Variant::Complete;
-  o.tscs.complete.start = start;
-  o.tscs.complete.end = end;
-  return o;
-}
-
-ThreadContinuousExecution ThreadContinuousExecution::CreateHintedStartExecution(
-    lldb::core_id_t core_id, lldb::tid_t tid, uint64_t hinted_start,
-    uint64_t end) {
-  ThreadContinuousExecution o(core_id, tid);
-  o.variant = Variant::HintedStart;
-  o.tscs.hinted_start.hinted_start = hinted_start;
-  o.tscs.hinted_start.end = end;
-  return o;
-}
-
-ThreadContinuousExecution ThreadContinuousExecution::CreateHintedEndExecution(
-    lldb::core_id_t core_id, lldb::tid_t tid, uint64_t start,
-    uint64_t hinted_end) {
-  ThreadContinuousExecution o(core_id, tid);
-  o.variant = Variant::HintedEnd;
-  o.tscs.hinted_end.start = start;
-  o.tscs.hinted_end.hinted_end = hinted_end;
-  return o;
-}
-
-ThreadContinuousExecution ThreadContinuousExecution::CreateOnlyEndExecution(
-    lldb::core_id_t core_id, lldb::tid_t tid, uint64_t end) {
-  ThreadContinuousExecution o(core_id, tid);
-  o.variant = Variant::OnlyEnd;
-  o.tscs.only_end.end = end;
-  return o;
-}
-
-ThreadContinuousExecution ThreadContinuousExecution::CreateOnlyStartExecution(
-    lldb::core_id_t core_id, lldb::tid_t tid, uint64_t start) {
-  ThreadContinuousExecution o(core_id, tid);
-  o.variant = Variant::OnlyStart;
-  o.tscs.only_start.start = start;
-  return o;
-}
-
-bool ThreadContinuousExecution::operator<(
-    const ThreadContinuousExecution &o) const {
-  // We can compare by GetErrorFreeTSC because context switches across CPUs can
-  // be sorted by any of its TSC.
-  return GetErrorFreeTSC() < o.GetErrorFreeTSC();
-}
-
-/// Tries to recover a continuous execution by analyzing two consecutive context
-/// switch records.
-static Error
-HandleContextSwitch(core_id_t core_id,
-                    const LinuxPerfZeroTscConversion &tsc_conversion,
-                    const ContextSwitchRecord &record,
-                    const Optional<ContextSwitchRecord> &prev_record,
-                    std::function<void(ThreadContinuousExecution &&execution)>
-                        on_new_thread_execution) {
-  if (!prev_record) {
-    if (record.IsOut())
-      on_new_thread_execution(ThreadContinuousExecution::CreateOnlyEndExecution(
-          core_id, record.tid, record.tsc));
-    // The 'in' case will be handled later when we try to look for its end
-    return Error::success();
-  }
-
-  const ContextSwitchRecord &prev = *prev_record;
-  if (prev.tsc > record.tsc)
-    return createStringError(
-        inconvertibleErrorCode(),
-        formatv("A context switch record out doesn't happen after the previous "
-                "record. Previous TSC= {0}, current TSC = {1}.",
-                prev.tsc, record.tsc));
-
-  if (record.IsIn() && prev.IsIn()) {
-    // We found two consecutive ins, which means that we didn't capture
-    // the end of the previous execution.
-    on_new_thread_execution(ThreadContinuousExecution::CreateHintedEndExecution(
-        core_id, prev.tid, prev.tsc, record.tsc - 1));
-  } else if (record.IsOut() && prev.IsOut()) {
-    // We found two consecutive outs, that means that we didn't capture
-    // the beginning of the current execution.
-    on_new_thread_execution(
-        ThreadContinuousExecution::CreateHintedStartExecution(
-            core_id, record.tid, prev.tsc + 1, record.tsc));
-  } else if (record.IsOut() && prev.IsIn()) {
-    if (record.pid == prev.pid && record.tid == prev.tid) {
-      /// A complete execution
-      on_new_thread_execution(
-          ThreadContinuousExecution::CreateCompleteExecution(
-              core_id, record.tid, prev.tsc, record.tsc));
-    } else {
-      // An out after the in of a different thread. The first one doesn't
-      // have an end, and the second one doesn't have a start.
-      on_new_thread_execution(
-          ThreadContinuousExecution::CreateHintedEndExecution(
-              core_id, prev.tid, prev.tsc, record.tsc - 1));
-      on_new_thread_execution(
-          ThreadContinuousExecution::CreateHintedStartExecution(
-              core_id, record.tid, prev.tsc + 1, record.tsc));
-    }
-  }
-  return Error::success();
-}
-
-/// Decodes a context switch trace gotten with perf_event_open.
-///
-/// \param[in] data
-///   The context switch trace in binary format.
-///
-/// \param[i] core_id
-///   The core_id where the trace were gotten from.
-///
-/// \param[in] tsc_conversion
-///   The conversion values used to confert nanoseconds to TSC.
-///
-/// \param[in] on_new_thread_execution
-///   Callback to be invoked whenever a continuous execution is recovered from
-///   the trace.
-static Error DecodePerfContextSwitchTrace(
-    ArrayRef<uint8_t> data, core_id_t core_id,
-    const LinuxPerfZeroTscConversion &tsc_conversion,
-    std::function<void(ThreadContinuousExecution &&execution)>
-        on_new_thread_execution) {
-  auto CreateError = [&](size_t offset, auto error) -> Error {
-    return createStringError(inconvertibleErrorCode(),
-                             formatv("Malformed perf context switch trace for "
-                                     "cpu {0} at offset {1}. {2}",
-                                     core_id, offset, error));
-  };
-
-  Optional<ContextSwitchRecord> prev_record;
-  for (size_t offset = 0; offset < data.size();) {
-    const PerfContextSwitchRecord &perf_record =
-        *reinterpret_cast<const PerfContextSwitchRecord *>(data.data() +
-                                                           offset);
-    // A record of 1000 uint64_t's or more should mean that the data is wrong
-    if (perf_record.header.size == 0 ||
-        perf_record.header.size > sizeof(uint64_t) * 1000)
-      return CreateError(offset, formatv("A record of {0} bytes was found.",
-                                         perf_record.header.size));
-
-    // We add + 100 to this record because some systems might have custom
-    // records. In any case, we are looking only for abnormal data.
-    if (perf_record.header.type >= PERF_RECORD_MAX + 100)
-      return CreateError(offset, formatv("Invalid record type {0} was found.",
-                                         perf_record.header.type));
-
-    if (perf_record.header.type == PERF_RECORD_SWITCH_CPU_WIDE) {
-      ContextSwitchRecord record{tsc_conversion.ToTSC(std::chrono::nanoseconds(
-                                     perf_record.time_in_nanos)),
-                                 perf_record.IsOut(),
-                                 static_cast<lldb::pid_t>(perf_record.pid),
-                                 static_cast<lldb::tid_t>(perf_record.tid)};
-
-      if (Error err = HandleContextSwitch(core_id, tsc_conversion, record,
-                                          prev_record, on_new_thread_execution))
-        return CreateError(offset, toString(std::move(err)));
-
-      prev_record = record;
-    }
-    offset += perf_record.header.size;
-  }
-
-  // We might have an incomplete last record
-  if (prev_record && prev_record->IsIn())
-    on_new_thread_execution(ThreadContinuousExecution::CreateOnlyStartExecution(
-        core_id, prev_record->tid, prev_record->tsc));
-
-  return Error::success();
-}
-
 TraceIntelPTMultiCoreDecoder::TraceIntelPTMultiCoreDecoder(
     TraceIntelPT &trace, ArrayRef<core_id_t> core_ids, ArrayRef<tid_t> tids,
     const LinuxPerfZeroTscConversion &tsc_conversion)
@@ -245,10 +31,91 @@
   if (Error err = DecodeContextSwitchTraces())
     return std::make_shared<DecodedThread>(thread.shared_from_this(),
                                            std::move(err));
+  auto it = m_decoded_threads.find(thread.GetID());
+  if (it != m_decoded_threads.end())
+    return it->second;
+
+  DecodedThreadSP decoded_thread_sp =
+      std::make_shared<DecodedThread>(thread.shared_from_this());
+
+  Error err = m_trace.OnCoresBinaryDataRead(
+      m_cores, IntelPTDataKinds::kTraceBuffer,
+      [&](const DenseMap<core_id_t, ArrayRef<uint8_t>> buffers) -> Error {
+        auto it = m_continuous_executions_per_thread->find(thread.GetID());
+        if (it != m_continuous_executions_per_thread->end())
+          DecodeTrace(*decoded_thread_sp, m_trace, buffers, it->second);
+
+        return Error::success();
+      });
+  if (err)
+    decoded_thread_sp->SetAsFailed(std::move(err));
+
+  m_decoded_threads.try_emplace(thread.GetID(), decoded_thread_sp);
+  return decoded_thread_sp;
+}
 
-  return std::make_shared<DecodedThread>(
-      thread.shared_from_this(),
-      createStringError(inconvertibleErrorCode(), "unimplemented"));
+llvm::Expected<
+    llvm::DenseMap<lldb::tid_t, std::vector<IntelPTThreadContinousExecution>>>
+TraceIntelPTMultiCoreDecoder::CorrelateContextSwitchesAndIntelPtTraces() {
+  llvm::DenseMap<lldb::tid_t, std::vector<IntelPTThreadContinousExecution>>
+      continuous_executions_per_thread;
+
+  for (core_id_t core_id : m_cores) {
+    std::vector<IntelPTThreadSubtrace> intel_pt_executions;
+
+    Error err = m_trace.OnCoreBinaryDataRead(
+        core_id, IntelPTDataKinds::kTraceBuffer,
+        [&](ArrayRef<uint8_t> data) -> Error {
+          Expected<std::vector<IntelPTThreadSubtrace>> split_trace =
+              SplitTraceInContinuousExecutions(m_trace, data);
+          if (!split_trace)
+            return split_trace.takeError();
+
+          intel_pt_executions = std::move(*split_trace);
+          return Error::success();
+        });
+    if (err)
+      return std::move(err);
+
+    // We'll be iterating through the thread continuous executions and the intel
+    // pt subtraces sorted by time.
+    auto it = intel_pt_executions.begin();
+    auto on_new_thread_execution =
+        [&](ThreadContinuousExecution thread_execution) {
+          IntelPTThreadContinousExecution execution(thread_execution);
+
+          for (; it != intel_pt_executions.end() &&
+                 it->tsc < thread_execution.GetEndTSC();
+               it++) {
+            if (it->tsc > thread_execution.GetStartTSC()) {
+              execution.intelpt_subtraces.push_back(*it);
+            } else {
+              m_unattributed_intelpt_subtraces++;
+            }
+          }
+          continuous_executions_per_thread[thread_execution.tid].push_back(
+              execution);
+        };
+    err = m_trace.OnCoreBinaryDataRead(
+        core_id, IntelPTDataKinds::kPerfContextSwitchTrace,
+        [&](ArrayRef<uint8_t> data) -> Error {
+          Expected<std::vector<ThreadContinuousExecution>> executions =
+              DecodePerfContextSwitchTrace(data, core_id, m_tsc_conversion);
+          if (!executions)
+            return executions.takeError();
+          for (const ThreadContinuousExecution &exec : *executions)
+            on_new_thread_execution(exec);
+          return Error::success();
+        });
+    if (err)
+      return std::move(err);
+  }
+  // We now sort the executions of each thread to have them ready for
+  // instruction decoding
+  for (auto &tid_executions : continuous_executions_per_thread)
+    std::sort(tid_executions.second.begin(), tid_executions.second.end());
+
+  return continuous_executions_per_thread;
 }
 
 Error TraceIntelPTMultiCoreDecoder::DecodeContextSwitchTraces() {
@@ -258,38 +125,20 @@
   if (m_continuous_executions_per_thread)
     return Error::success();
 
-  m_continuous_executions_per_thread.emplace();
-
-  auto do_decode = [&]() -> Error {
-    // We'll decode all context switch traces, identify continuous executions
-    // and group them by thread.
-    for (core_id_t core_id : m_cores) {
-      Error err = m_trace.OnCoreBinaryDataRead(
-          core_id, IntelPTDataKinds::kPerfContextSwitchTrace,
-          [&](ArrayRef<uint8_t> data) -> Error {
-            return DecodePerfContextSwitchTrace(
-                data, core_id, m_tsc_conversion,
-                [&](const ThreadContinuousExecution &execution) {
-                  (*m_continuous_executions_per_thread)[execution.tid]
-                      .push_back(execution);
-                });
-          });
-      if (err) {
-        m_setup_error = toString(std::move(err));
-        return createStringError(inconvertibleErrorCode(),
-                                 m_setup_error->c_str());
-      }
-    }
-    // We now sort the executions of each to have them ready for instruction
-    // decoding
-    for (auto &tid_executions : *m_continuous_executions_per_thread)
-      std::sort(tid_executions.second.begin(), tid_executions.second.end());
-
-    return Error::success();
-  };
-
-  return m_trace.GetTimer().ForGlobal().TimeTask<Error>(
-      "Context switch trace decoding", do_decode);
+  Error err = m_trace.GetTimer().ForGlobal().TimeTask<Error>(
+      "Context switch and Intel PT traces correlation", [&]() -> Error {
+        if (auto correlation = CorrelateContextSwitchesAndIntelPtTraces()) {
+          m_continuous_executions_per_thread.emplace(std::move(*correlation));
+          return Error::success();
+        } else {
+          return correlation.takeError();
+        }
+      });
+  if (err) {
+    m_setup_error = toString(std::move(err));
+    return createStringError(inconvertibleErrorCode(), m_setup_error->c_str());
+  }
+  return Error::success();
 }
 
 size_t TraceIntelPTMultiCoreDecoder::GetNumContinuousExecutionsForThread(
diff --git a/lldb/source/Target/Trace.cpp b/lldb/source/Target/Trace.cpp
--- a/lldb/source/Target/Trace.cpp
+++ b/lldb/source/Target/Trace.cpp
@@ -128,8 +128,8 @@
 
 Optional<uint64_t> Trace::GetLiveCoreBinaryDataSize(lldb::core_id_t core_id,
                                                     llvm::StringRef kind) {
-  auto it = m_live_core_data.find(core_id);
-  if (it == m_live_core_data.end())
+  auto it = m_live_core_data_sizes.find(core_id);
+  if (it == m_live_core_data_sizes.end())
     return None;
   std::unordered_map<std::string, uint64_t> &single_core_data = it->second;
   auto single_thread_data_it = single_core_data.find(kind.str());
@@ -211,6 +211,8 @@
   m_stop_id = new_stop_id;
   m_live_thread_data.clear();
   m_live_refresh_error.reset();
+  m_live_core_data_sizes.clear();
+  m_live_core_data.clear();
   m_cores.reset();
 
   auto HandleError = [&](Error &&err) -> const char * {
@@ -246,7 +248,7 @@
     for (const TraceCoreState &core_state : *live_process_state->cores) {
       m_cores->push_back(core_state.core_id);
       for (const TraceBinaryData &item : core_state.binary_data)
-        m_live_core_data[core_state.core_id][item.kind] = item.size;
+        m_live_core_data_sizes[core_state.core_id][item.kind] = item.size;
     }
     LLDB_LOG(log, "== Found {0} cpu cores being traced",
             live_process_state->cores->size());
@@ -353,10 +355,18 @@
 llvm::Error Trace::OnLiveCoreBinaryDataRead(lldb::core_id_t core_id,
                                             llvm::StringRef kind,
                                             OnBinaryDataReadCallback callback) {
+  auto core_data_entries = m_live_core_data.find(core_id);
+  if (core_data_entries != m_live_core_data.end()) {
+    auto core_data = core_data_entries->second.find(kind.str());
+    if (core_data != core_data_entries->second.end())
+      return callback(core_data->second);
+  }
+
   Expected<std::vector<uint8_t>> data = GetLiveCoreBinaryData(core_id, kind);
   if (!data)
     return data.takeError();
-  return callback(*data);
+  auto it = m_live_core_data[core_id].insert({kind.str(), std::move(*data)});
+  return callback(it.first->second);
 }
 
 llvm::Error Trace::OnDataFileRead(FileSpec file,
@@ -401,6 +411,28 @@
     return OnPostMortemThreadBinaryDataRead(tid, kind, callback);
 }
 
+llvm::Error
+Trace::OnCoresBinaryDataRead(const std::set<lldb::core_id_t> core_ids,
+                             llvm::StringRef kind,
+                             OnCoresBinaryDataReadCallback callback) {
+  DenseMap<core_id_t, ArrayRef<uint8_t>> buffers;
+
+  std::function<Error(std::set<core_id_t>::iterator)> process_core =
+      [&](std::set<core_id_t>::iterator core_id) -> Error {
+    if (core_id == core_ids.end())
+      return callback(buffers);
+
+    return OnCoreBinaryDataRead(*core_id, kind,
+                                [&](ArrayRef<uint8_t> data) -> Error {
+                                  buffers.try_emplace(*core_id, data);
+                                  auto next_id = core_id;
+                                  next_id++;
+                                  return process_core(next_id);
+                                });
+  };
+  return process_core(core_ids.begin());
+}
+
 llvm::Error Trace::OnCoreBinaryDataRead(lldb::core_id_t core_id,
                                         llvm::StringRef kind,
                                         OnBinaryDataReadCallback callback) {
diff --git a/lldb/source/Utility/TraceIntelPTGDBRemotePackets.cpp b/lldb/source/Utility/TraceIntelPTGDBRemotePackets.cpp
--- a/lldb/source/Utility/TraceIntelPTGDBRemotePackets.cpp
+++ b/lldb/source/Utility/TraceIntelPTGDBRemotePackets.cpp
@@ -50,18 +50,15 @@
   return base;
 }
 
-std::chrono::nanoseconds
-LinuxPerfZeroTscConversion::ToNanos(uint64_t tsc) const {
+uint64_t LinuxPerfZeroTscConversion::ToNanos(uint64_t tsc) const {
   uint64_t quot = tsc >> time_shift;
   uint64_t rem_flag = (((uint64_t)1 << time_shift) - 1);
   uint64_t rem = tsc & rem_flag;
-  return std::chrono::nanoseconds{time_zero + quot * time_mult +
-                                  ((rem * time_mult) >> time_shift)};
+  return time_zero + quot * time_mult + ((rem * time_mult) >> time_shift);
 }
 
-uint64_t
-LinuxPerfZeroTscConversion::ToTSC(std::chrono::nanoseconds nanos) const {
-  uint64_t time = nanos.count() - time_zero;
+uint64_t LinuxPerfZeroTscConversion::ToTSC(uint64_t nanos) const {
+  uint64_t time = nanos - time_zero;
   uint64_t quot = time / time_mult;
   uint64_t rem = time % time_mult;
   return (quot << time_shift) + (rem << time_shift) / time_mult;
diff --git a/lldb/test/API/commands/trace/TestTraceLoad.py b/lldb/test/API/commands/trace/TestTraceLoad.py
--- a/lldb/test/API/commands/trace/TestTraceLoad.py
+++ b/lldb/test/API/commands/trace/TestTraceLoad.py
@@ -9,6 +9,18 @@
     mydir = TestBase.compute_mydir(__file__)
     NO_DEBUG_INFO_TESTCASE = True
 
+    def testLoadMultiCoreTrace(self):
+        src_dir = self.getSourceDir()
+        trace_definition_file = os.path.join(src_dir, "intelpt-multi-core-trace", "trace.json")
+        self.expect("trace load -v " + trace_definition_file, substrs=["intel-pt"])
+        self.expect("thread trace dump instructions 2 -t",
+          substrs=["19521: [tsc=0x008fb5211c143fd8] error: expected tracing enabled event",
+                   "m.out`foo() + 65 at multi_thread.cpp:12:21",
+                   "19520: [tsc=0x008fb5211bfbc69e] 0x0000000000400ba7    jg     0x400bb3"])
+        self.expect("thread trace dump instructions 3 -t",
+          substrs=["67910: [tsc=0x008fb5211bfdf270] 0x0000000000400bd7    addl   $0x1, -0x4(%rbp)",
+                   "m.out`bar() + 26 at multi_thread.cpp:20:6"])
+
     def testLoadTrace(self):
         src_dir = self.getSourceDir()
         trace_definition_file = os.path.join(src_dir, "intelpt-trace", "trace.json")
diff --git a/lldb/test/API/commands/trace/intelpt-multi-core-trace/cores/45.intelpt_trace b/lldb/test/API/commands/trace/intelpt-multi-core-trace/cores/45.intelpt_trace
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001

literal 0
Hc$@<O00001

diff --git a/lldb/test/API/commands/trace/intelpt-multi-core-trace/cores/45.perf_context_switch_trace b/lldb/test/API/commands/trace/intelpt-multi-core-trace/cores/45.perf_context_switch_trace
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001

literal 0
Hc$@<O00001

diff --git a/lldb/test/API/commands/trace/intelpt-multi-core-trace/cores/51.intelpt_trace b/lldb/test/API/commands/trace/intelpt-multi-core-trace/cores/51.intelpt_trace
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001

literal 0
Hc$@<O00001

diff --git a/lldb/test/API/commands/trace/intelpt-multi-core-trace/cores/51.perf_context_switch_trace b/lldb/test/API/commands/trace/intelpt-multi-core-trace/cores/51.perf_context_switch_trace
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001

literal 0
Hc$@<O00001

diff --git a/lldb/test/API/commands/trace/intelpt-multi-core-trace/modules/m.out b/lldb/test/API/commands/trace/intelpt-multi-core-trace/modules/m.out
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001

literal 0
Hc$@<O00001

diff --git a/lldb/test/API/commands/trace/intelpt-multi-core-trace/multi_thread.cpp b/lldb/test/API/commands/trace/intelpt-multi-core-trace/multi_thread.cpp
new file mode 100644
--- /dev/null
+++ b/lldb/test/API/commands/trace/intelpt-multi-core-trace/multi_thread.cpp
@@ -0,0 +1,34 @@
+#include <thread>
+#include <cstring>
+#include <unistd.h>
+using namespace std;
+
+bool done = false;
+void foo() {
+  int x = 0;
+  for (int i = 0; i < 10000; i++)
+    x++;
+  sleep(1);
+  for (int i = 0; i < 10000; i++)
+    x++;
+  done = true;
+}
+
+void bar() {
+  int y = 0;
+  while (!done) {
+    y++;
+  }
+  printf("bar %d\n", y);
+}
+
+int main() {
+  std::thread first(foo);
+  std::thread second(bar);
+  first.join();
+  second.join();
+
+  printf("complete\n");
+  return 0;
+
+}
diff --git a/lldb/test/API/commands/trace/intelpt-multi-core-trace/trace.json b/lldb/test/API/commands/trace/intelpt-multi-core-trace/trace.json
new file mode 100644
--- /dev/null
+++ b/lldb/test/API/commands/trace/intelpt-multi-core-trace/trace.json
@@ -0,0 +1,51 @@
+{
+  "cores": [
+    {
+      "contextSwitchTrace": "/tmp/trace8/cores/45.perf_context_switch_trace",
+      "coreId": 45,
+      "traceBuffer": "/tmp/trace8/cores/45.intelpt_trace"
+    },
+    {
+      "contextSwitchTrace": "/tmp/trace8/cores/51.perf_context_switch_trace",
+      "coreId": 51,
+      "traceBuffer": "/tmp/trace8/cores/51.intelpt_trace"
+    }
+  ],
+  "cpuInfo": {
+    "family": 6,
+    "model": 85,
+    "stepping": 4,
+    "vendor": "GenuineIntel"
+  },
+  "processes": [
+    {
+      "modules": [
+        {
+          "file": "modules/m.out",
+          "systemPath": "/tmp/m.out",
+          "loadAddress": 4194304,
+          "uuid": "AEFB0D59-233F-80FF-6D3C-4DED498534CF-11017B3B"
+        }
+      ],
+      "pid": 3497234,
+      "threads": [
+        {
+          "tid": 3497234
+        },
+        {
+          "tid": 3497496
+        },
+        {
+          "tid": 3497497
+        }
+      ],
+      "triple": "x86_64-unknown-linux-gnu"
+    }
+  ],
+  "tscPerfZeroConversion": {
+    "timeMult": 1076264588,
+    "timeShift": 31,
+    "timeZero": 18433473881008870804
+  },
+  "type": "intel-pt"
+}
diff --git a/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py b/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py
--- a/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py
+++ b/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py
@@ -233,6 +233,6 @@
         # We must have captured the context switch of when the target resumed
         self.assertTrue(found_non_empty_context_switch)
 
-        self.expect("thread trace dump instructions", substrs=['unimplemented'])
+        self.expect("thread trace dump instructions")
 
         self.traceStopProcess()
diff --git a/lldb/unittests/Process/Linux/PerfTests.cpp b/lldb/unittests/Process/Linux/PerfTests.cpp
--- a/lldb/unittests/Process/Linux/PerfTests.cpp
+++ b/lldb/unittests/Process/Linux/PerfTests.cpp
@@ -76,14 +76,14 @@
   if (!tsc_after_sleep)
     GTEST_SKIP() << toString(tsc_after_sleep.takeError());
 
-  std::chrono::nanoseconds converted_tsc_diff =
+  uint64_t converted_tsc_diff =
       params->ToNanos(*tsc_after_sleep) - params->ToNanos(*tsc_before_sleep);
 
   std::chrono::microseconds acceptable_overhead(500);
 
-  ASSERT_GE(converted_tsc_diff.count(), SLEEP_NANOS.count());
-  ASSERT_LT(converted_tsc_diff.count(),
-            (SLEEP_NANOS + acceptable_overhead).count());
+  ASSERT_GE(converted_tsc_diff, static_cast<uint64_t>(SLEEP_NANOS.count()));
+  ASSERT_LT(converted_tsc_diff,
+            static_cast<uint64_t>((SLEEP_NANOS + acceptable_overhead).count()));
 }
 
 #endif // __x86_64__