diff --git a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.h b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.h
--- a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.h
+++ b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.h
@@ -12,17 +12,124 @@
 #include "intel-pt.h"
 
 #include "DecodedThread.h"
+#include "forward-declarations.h"
 
 namespace lldb_private {
 namespace trace_intel_pt {
 
+/// Chunk of a single core's Intel PT trace that starts at a synchronization
+/// point (PSB packet). \a tsc is the timestamp emitted at that PSB,
+/// \a psb_offset is its offset within the core's trace buffer, and
+/// \a next_psb_offset is the offset of the following PSB, i.e. where this
+/// chunk ends.
+struct IntelPTContinuousExecution {
+  uint64_t tsc;
+  lldb::core_id_t core_id;
+  uint64_t psb_offset;
+  uint64_t next_psb_offset;
+};
+
+/// This class indicates the time interval in which a thread was running
+/// continuously on a cpu core.
+///
+/// In most cases both endpoints of the intervals can be accurately recovered
+/// from a context switch trace, but in some cases one of these endpoints might
+/// be guessed or not known at all, due to contention problems in the trace or
+/// because tracing was interrupted.
+///
+/// Note: we use the terms CPU and core interchangeably.
+struct ThreadContinuousExecution {
+  enum class Variant {
+    /// Both endpoints are known
+    Complete,
+    /// The end is known and we have a guess for the start
+    HintedStart,
+    /// The start is known and we have a guess for the end
+    HintedEnd,
+    /// We only know the start. This might be the last entry of a core trace.
+    OnlyStart,
+    /// We only know the end. This might be the first entry of a core trace.
+    OnlyEnd,
+  } variant;
+
+  union {
+    struct {
+      uint64_t start;
+      uint64_t end;
+    } complete;
+    struct {
+      uint64_t start;
+    } only_start;
+    struct {
+      uint64_t end;
+    } only_end;
+    /// The following 'hinted' structures are useful when there are contention
+    /// problems in the trace
+    struct {
+      uint64_t hinted_start;
+      uint64_t end;
+    } hinted_start;
+    struct {
+      uint64_t start;
+      uint64_t hinted_end;
+    } hinted_end;
+  } tscs;
+
+  lldb::core_id_t core_id;
+  lldb::tid_t tid;
+
+  /// \return
+  ///   A tsc that we are certain of, either the start or the end.
+  uint64_t GetErrorFreeTSC() const;
+
+  /// Constructors for the different variants of this object
+  ///
+  /// \{
+  static ThreadContinuousExecution
+  CreateCompleteExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                          uint64_t start, uint64_t end);
+
+  static ThreadContinuousExecution
+  CreateHintedStartExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                             uint64_t hinted_start, uint64_t end);
+
+  static ThreadContinuousExecution
+  CreateHintedEndExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                           uint64_t start, uint64_t hinted_end);
+
+  static ThreadContinuousExecution
+  CreateOnlyEndExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                         uint64_t end);
+
+  static ThreadContinuousExecution
+  CreateOnlyStartExecution(lldb::core_id_t core_id, lldb::tid_t tid,
+                           uint64_t start);
+  /// \}
+
+  void
+  AddIntelPTExecution(const IntelPTContinuousExecution &intel_pt_execution);
+
+  /// Comparator by TSCs
+  bool operator<(const ThreadContinuousExecution &o) const;
+
+  std::vector<IntelPTContinuousExecution> m_intel_pt_executions;
+
+private:
+  ThreadContinuousExecution(lldb::core_id_t core_id, lldb::tid_t tid)
+      : core_id(core_id), tid(tid) {}
+};
+
 /// Decode a raw Intel PT trace given in \p buffer and append the decoded
 /// instructions and errors in \p decoded_thread. It uses the low level libipt
 /// library underneath.
 void DecodeTrace(DecodedThread &decoded_thread, TraceIntelPT &trace_intel_pt,
                  llvm::ArrayRef<uint8_t> buffer);
 
+/// Decode a trace that may span multiple core buffers, given in \p buffers,
+/// and append the decoded instructions and errors in \p decoded_thread. For
+/// each Intel PT sub-execution of each entry in \p executions, the decoder of
+/// the corresponding core is synchronized to that sub-execution's PSB offset
+/// and decodes until the next PSB offset.
+void DecodeTrace(
+    DecodedThread &decoded_thread, TraceIntelPT &trace_intel_pt,
+    const llvm::DenseMap<lldb::core_id_t, llvm::ArrayRef<uint8_t>> &buffers,
+    const std::vector<ThreadContinuousExecution> &executions);
+
+/// Split the raw Intel PT trace given in \p buffer into continuous executions
+/// delimited by PSB synchronization points.
+llvm::Expected<std::vector<IntelPTContinuousExecution>>
+SplitTraceInContinuousExecutions(TraceIntelPT &trace_intel_pt,
+                                 llvm::ArrayRef<uint8_t> buffer);
+
 } // namespace trace_intel_pt
 } // namespace lldb_private
diff --git a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp
--- a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp
+++ b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp
@@ -26,6 +26,96 @@
   explicit operator bool() const { return has_tsc == eLazyBoolYes; }
 };
 
+class LibiptSplitter {
+public:
+  LibiptSplitter(pt_insn_decoder &decoder) : m_decoder(decoder) {}
+
+  std::vector<IntelPTContinuousExecution> SplitTraceInContinuousExecutions() {
+    int status = pte_ok;
+    std::vector<IntelPTContinuousExecution> executions;
+    while (!IsLibiptError(status = FindNextSynchronizationPoint())) {
+      if (IsLibiptError(status = ProcessPTEvents(status)))
+        continue;
+
+      Optional<uint64_t> tsc = FetchTsc();
+      if (!tsc)
+        continue;
+      uint64_t psb_offset = 0;
+      if (!IsLibiptError(pt_insn_get_sync_offset(&m_decoder, &psb_offset))) {
+        executions.push_back({
+            *tsc,
+            0, // core id
+            psb_offset,
+            0, // next_psb_offset
+        });
+      }
+    }
+    for (size_t i = 0; i + 1 < executions.size(); i++)
+      executions[i].next_psb_offset = executions[i + 1].psb_offset;
+    return executions;
+  }
+
+private:
+  Optional<uint64_t> FetchTsc() {
+    uint64_t tsc;
+    int tsc_status;
+    if (IsLibiptError(tsc_status =
+                          pt_insn_time(&m_decoder, &tsc, nullptr, nullptr))) {
+      return None;
+    }
+    return tsc;
+  }
+
+  int ProcessPTEvents(int status) {
+    while (status & pts_event_pending) {
+      pt_event event;
+      status = pt_insn_event(&m_decoder, &event, sizeof(event));
+      if (IsLibiptError(status)) {
+        return status;
+      }
+    }
+    return pte_ok;
+  }
+  int FindNextSynchronizationPoint() {
+    // Try to sync the decoder. If it fails, then get the decoder_offset and
+    // try to sync again from the next synchronization point. If the
+    // new_decoder_offset is the same as decoder_offset, then we can't move to
+    // the next synchronization point. Otherwise, keep resyncing until either
+    // the end of the trace stream (eos) is reached or pt_insn_sync_forward()
+    // succeeds.
+    int status = pt_insn_sync_forward(&m_decoder);
+
+    if (!IsEndOfStream(status) && IsLibiptError(status)) {
+      uint64_t decoder_offset = 0;
+      int errcode_off = pt_insn_get_offset(&m_decoder, &decoder_offset);
+      if (!IsLibiptError(errcode_off)) { // we could get the offset
+        while (true) {
+          status = pt_insn_sync_forward(&m_decoder);
+          if (!IsLibiptError(status) || IsEndOfStream(status))
+            break;
+
+          uint64_t new_decoder_offset = 0;
+          errcode_off = pt_insn_get_offset(&m_decoder, &new_decoder_offset);
+          if (IsLibiptError(errcode_off))
+            break; // We can't further synchronize.
+          else if (new_decoder_offset <= decoder_offset) {
+            // We tried resyncing the decoder and it didn't make any progress
+            // because the offset didn't change. We will not make any further
+            // progress. Hence, we stop in this situation.
+            break;
+          }
+          // We'll try again starting from a new offset.
+          decoder_offset = new_decoder_offset;
+        }
+      }
+    }
+
+    return status;
+  }
+
+  pt_insn_decoder &m_decoder;
+  TscInfo m_tsc_info;
+};
+
 /// Class that decodes a raw buffer for a single thread using the low level
 /// libipt library.
 ///
@@ -62,6 +152,26 @@
     }
   }
 
+  void ResetDecoderOffset(uint64_t offset) {
+    int error = pte_ok;
+    if (IsLibiptError(error = pt_insn_sync_set(&m_decoder, offset)))
+      m_decoded_thread.Append(DecodedInstruction(error));
+  }
+
+  void DecodeUntilOffset(uint64_t end_offset) {
+    int status = pte_ok;
+    while (!IsLibiptError(status = FindNextSynchronizationPoint())) {
+      uint64_t cur_offset;
+      pt_insn_get_offset(&m_decoder, &cur_offset);
+      if (cur_offset > end_offset)
+        break;
+      // We have synchronized, so we can start decoding instructions and
+      // events. Multiple iterations of this loop indicate gaps in the trace.
+      DecodeInstructionsAndEvents(status, end_offset);
+    }
+  }
+
 private:
   /// Invoke the low level function \a pt_insn_next and store the decoded
   /// instruction in the given \a DecodedInstruction.
@@ -80,8 +190,15 @@
   ///
   /// \param[in] status
   ///   The status that was result of synchronizing to the most recent PSB.
-  void DecodeInstructionsAndEvents(int status) {
+  void DecodeInstructionsAndEvents(int status,
+                                   Optional<uint64_t> end_offset = None) {
     while (DecodedInstruction insn = ProcessPTEvents(status)) {
+      if (end_offset) {
+        uint64_t cur_offset;
+        pt_insn_get_offset(&m_decoder, &cur_offset);
+        if (cur_offset > *end_offset)
+          break;
+      }
       // The status returned by DecodeNextInstruction will need to be processed
       // by ProcessPTEvents in the next loop if it is not an error.
       if (IsLibiptError(status = DecodeNextInstruction(insn))) {
@@ -264,8 +381,7 @@
     std::unique_ptr<pt_insn_decoder, std::function<void(pt_insn_decoder *)>>;
 
 static Expected<PtInsnDecoderUP>
-CreateInstructionDecoder(DecodedThread &decoded_thread,
-                         TraceIntelPT &trace_intel_pt,
+CreateInstructionDecoder(TraceIntelPT &trace_intel_pt,
                          ArrayRef<uint8_t> buffer) {
   Expected<pt_cpu> cpu_info = trace_intel_pt.GetCPUInfo();
   if (!cpu_info)
@@ -287,25 +403,73 @@
   pt_insn_decoder *decoder_ptr = pt_insn_alloc_decoder(&config);
   if (!decoder_ptr)
     return make_error<IntelPTError>(-pte_nomem);
-  PtInsnDecoderUP decoder_up(decoder_ptr, DecoderDeleter);
-  pt_image *image = pt_insn_get_image(decoder_ptr);
-  Process *process = decoded_thread.GetThread()->GetProcess().get();
+  return PtInsnDecoderUP(decoder_ptr, DecoderDeleter);
+}
+
+static Error SetupMemoryImages(PtInsnDecoderUP &decoder_up, Process &process) {
+  pt_image *image = pt_insn_get_image(decoder_up.get());
+  int status = pte_ok;
   if (IsLibiptError(
-          status = pt_image_set_callback(image, ReadProcessMemory, process)))
+          status = pt_image_set_callback(image, ReadProcessMemory, &process)))
     return make_error<IntelPTError>(status);
-  return decoder_up;
+  return Error::success();
 }
 
 void lldb_private::trace_intel_pt::DecodeTrace(DecodedThread &decoded_thread,
                                                TraceIntelPT &trace_intel_pt,
                                                ArrayRef<uint8_t> buffer) {
   Expected<PtInsnDecoderUP> decoder_up =
-      CreateInstructionDecoder(decoded_thread, trace_intel_pt, buffer);
+      CreateInstructionDecoder(trace_intel_pt, buffer);
   if (!decoder_up)
     return decoded_thread.SetAsFailed(decoder_up.takeError());
 
+  if (Error err = SetupMemoryImages(*decoder_up,
+                                    *decoded_thread.GetThread()->GetProcess()))
+    return decoded_thread.SetAsFailed(std::move(err));
+
   LibiptDecoder libipt_decoder(*decoder_up.get(), decoded_thread);
   libipt_decoder.DecodeUntilEndOfTrace();
 }
+
+void lldb_private::trace_intel_pt::DecodeTrace(
+    DecodedThread &decoded_thread, TraceIntelPT &trace_intel_pt,
+    const DenseMap<lldb::core_id_t, ArrayRef<uint8_t>> &buffers,
+    const std::vector<ThreadContinuousExecution> &executions) {
+  DenseMap<lldb::core_id_t, LibiptDecoder> decoders;
+  for (auto &core_id_buffer : buffers) {
+    Expected<PtInsnDecoderUP> decoder_up =
+        CreateInstructionDecoder(trace_intel_pt, core_id_buffer.second);
+    if (!decoder_up)
+      return decoded_thread.SetAsFailed(decoder_up.takeError());
+
+    if (Error err = SetupMemoryImages(
+            *decoder_up, *decoded_thread.GetThread()->GetProcess()))
+      return decoded_thread.SetAsFailed(std::move(err));
+
+    decoders.try_emplace(core_id_buffer.first,
+                         LibiptDecoder(*decoder_up.get(), decoded_thread));
+  }
+
+  for (const ThreadContinuousExecution &execution : executions) {
+    LibiptDecoder &decoder = decoders.find(execution.core_id)->second;
+    for (const IntelPTContinuousExecution &intel_pt_execution :
+         execution.m_intel_pt_executions) {
+      decoder.ResetDecoderOffset(intel_pt_execution.psb_offset);
+      decoder.DecodeUntilOffset(intel_pt_execution.next_psb_offset);
+    }
+  }
+}
+
+Expected<std::vector<IntelPTContinuousExecution>>
+lldb_private::trace_intel_pt::SplitTraceInContinuousExecutions(
+    TraceIntelPT &trace_intel_pt, llvm::ArrayRef<uint8_t> buffer) {
+  Expected<PtInsnDecoderUP> decoder_up =
+      CreateInstructionDecoder(trace_intel_pt, buffer);
+  if (!decoder_up)
+    return decoder_up.takeError();
+
+  LibiptSplitter splitter(*decoder_up.get());
+  return splitter.SplitTraceInContinuousExecutions();
+}
diff --git a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.h b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.h
--- a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.h
+++ b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.h
@@ -9,96 +9,12 @@
 #ifndef LLDB_SOURCE_PLUGINS_TRACE_INTEL_PT_TRACEINTELPTMULTICOREDECODER_H
 #define LLDB_SOURCE_PLUGINS_TRACE_INTEL_PT_TRACEINTELPTMULTICOREDECODER_H
 
+#include "LibiptDecoder.h"
 #include "ThreadDecoder.h"
 
 namespace lldb_private {
 namespace trace_intel_pt {
 
-/// This class indicates the time interval in which a thread was running
-/// continuously on a cpu core.
-///
-/// In most cases both endpoints of the intervals can be accurately recovered
-/// from a context switch trace, but in some cases one of these endpoints might
-/// be guessed or not known at all, due to contention problems in the trace or
-/// because tracing was interrupted.
-///
-/// Note: we use the terms CPU and cores interchangeably.
-struct ThreadContinuousExecution {
-  enum class Variant {
-    /// Both endpoints are known
-    Complete,
-    /// The end is known and we have a guess for the start
-    HintedStart,
-    /// The start is known and we have a guess for the end
-    HintedEnd,
-    /// We only know the start. This might be the last entry of a core trace.
-    OnlyStart,
-    /// We only know the end. This might be the first entry or a core trace.
-    OnlyEnd,
-  } variant;
-
-  union {
-    struct {
-      uint64_t start;
-      uint64_t end;
-    } complete;
-    struct {
-      uint64_t start;
-    } only_start;
-    struct {
-      uint64_t end;
-    } only_end;
-    /// The following 'hinted' structures are useful when there are contention
-    /// problems in the trace
-    struct {
-      uint64_t hinted_start;
-      uint64_t end;
-    } hinted_start;
-    struct {
-      uint64_t start;
-      uint64_t hinted_end;
-    } hinted_end;
-  } tscs;
-
-  lldb::core_id_t core_id;
-  lldb::tid_t tid;
-
-  /// \return
-  ///   A tsc that we are certain of, either the start or the end.
-  uint64_t GetErrorFreeTSC() const;
-
-  /// Constructors for the different variants of this object
-  ///
-  /// \{
-  static ThreadContinuousExecution
-  CreateCompleteExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                          uint64_t start, uint64_t end);
-
-  static ThreadContinuousExecution
-  CreateHintedStartExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                             uint64_t hinted_start, uint64_t end);
-
-  static ThreadContinuousExecution
-  CreateHintedEndExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                           uint64_t start, uint64_t hinted_end);
-
-  static ThreadContinuousExecution
-  CreateOnlyEndExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                         uint64_t end);
-
-  static ThreadContinuousExecution
-  CreateOnlyStartExecution(lldb::core_id_t core_id, lldb::tid_t tid,
-                           uint64_t start);
-  /// \}
-
-  /// Comparator by TSCs
-  bool operator<(const ThreadContinuousExecution &o) const;
-
-private:
-  ThreadContinuousExecution(lldb::core_id_t core_id, lldb::tid_t tid)
-      : core_id(core_id), tid(tid) {}
-};
-
 /// Class used to decode a multi-core Intel PT trace. It assumes that each
 /// thread could have potentially been executed on different cores. It uses a
 /// context switch trace per CPU with timestamps to identify which thread owns
@@ -143,6 +59,11 @@
   /// by thread.
   llvm::Error DecodeContextSwitchTraces();
 
+  void DecodeFromCore(
+      std::set<lldb::core_id_t>::iterator core_id,
+      llvm::DenseMap<lldb::core_id_t, llvm::ArrayRef<uint8_t>> &buffers,
+      DecodedThread &decoded_thread, Thread &thread);
+
   TraceIntelPT &m_trace;
   std::set<lldb::core_id_t> m_cores;
   std::set<lldb::tid_t> m_tids;
diff --git a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.cpp b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.cpp
--- a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.cpp
+++ b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTMultiCoreDecoder.cpp
@@ -101,6 +101,11 @@
   return o;
 }
 
+void ThreadContinuousExecution::AddIntelPTExecution(
+    const IntelPTContinuousExecution &intel_pt_execution) {
+  m_intel_pt_executions.push_back(intel_pt_execution);
+}
+
 bool ThreadContinuousExecution::operator<(
     const ThreadContinuousExecution &o) const {
   // We can compare by GetErrorFreeTSC because context switches across CPUs can
@@ -181,7 +186,7 @@
 static Error DecodePerfContextSwitchTrace(
     ArrayRef<uint8_t> data, core_id_t core_id,
     const LinuxPerfZeroTscConversion &tsc_conversion,
-    std::function<void(const ThreadContinuousExecution &execution)>
+    std::function<void(ThreadContinuousExecution execution)>
         on_new_thread_execution) {
   auto CreateError = [&](size_t offset, auto error) -> Error {
     return createStringError(inconvertibleErrorCode(),
@@ -241,14 +246,36 @@
   return m_tids.count(tid);
 }
 
+void TraceIntelPTMultiCoreDecoder::DecodeFromCore(
+    std::set<core_id_t>::iterator core_id,
+    DenseMap<core_id_t, ArrayRef<uint8_t>> &buffers,
+    DecodedThread &decoded_thread, Thread &thread) {
+  if (core_id == m_cores.end()) {
+    DecodeTrace(
+        decoded_thread, m_trace, buffers,
+        m_continuous_executions_per_thread->find(thread.GetID())->second);
+    return;
+  }
+  // Recurse over the remaining cores so that every core's trace buffer, which
+  // is provided only within its OnCoreBinaryDataRead callback, is still
+  // available when the innermost call finally decodes the thread.
+  cantFail(m_trace.OnCoreBinaryDataRead(
+      *core_id, IntelPTDataKinds::kTraceBuffer,
+      [&](ArrayRef<uint8_t> data) -> Error {
+        buffers.try_emplace(*core_id, data);
+        auto next_id = core_id;
+        next_id++;
+        DecodeFromCore(next_id, buffers, decoded_thread, thread);
+        return Error::success();
+      }));
+}
+
 DecodedThreadSP TraceIntelPTMultiCoreDecoder::Decode(Thread &thread) {
   if (Error err = DecodeContextSwitchTraces())
     return std::make_shared<DecodedThread>(thread.shared_from_this(),
                                            std::move(err));
-
-  return std::make_shared<DecodedThread>(
-      thread.shared_from_this(),
-      createStringError(inconvertibleErrorCode(), "unimplemented"));
+  DecodedThreadSP decoded_thread_sp =
+      std::make_shared<DecodedThread>(thread.shared_from_this());
+
+  DenseMap<core_id_t, ArrayRef<uint8_t>> buffers;
+  DecodeFromCore(m_cores.begin(), buffers, *decoded_thread_sp, thread);
+  return decoded_thread_sp;
 }
 
 Error TraceIntelPTMultiCoreDecoder::DecodeContextSwitchTraces() {
@@ -260,36 +287,80 @@
   m_continuous_executions_per_thread.emplace();
 
-  auto do_decode = [&]() -> Error {
-    // We'll decode all context switch traces, identify continuous executions
-    // and group them by thread.
+  auto correlate_context_switches_and_intel_pt_traces = [&]() -> Error {
     for (core_id_t core_id : m_cores) {
+      std::vector<IntelPTContinuousExecution> intel_pt_executions;
+      Error err = m_trace.OnCoreBinaryDataRead(
+          core_id, IntelPTDataKinds::kTraceBuffer,
+          [&](ArrayRef<uint8_t> data) -> Error {
+            Expected<std::vector<IntelPTContinuousExecution>> split_trace =
+                SplitTraceInContinuousExecutions(m_trace, data);
+            if (!split_trace)
+              return split_trace.takeError();
+            intel_pt_executions = std::move(*split_trace);
+
+            return Error::success();
+          });
+      if (err)
+        return err;
+
+      auto it = intel_pt_executions.begin();
+      auto on_new_thread_execution = [&](ThreadContinuousExecution execution) {
+        if (execution.variant ==
+            ThreadContinuousExecution::Variant::Complete) {
+          for (; it != intel_pt_executions.end() &&
+                 it->tsc < execution.tscs.complete.end;
+               it++) {
+            if (it->tsc > execution.tscs.complete.start) {
+              execution.AddIntelPTExecution(*it);
+            }
+          }
+        }
+        (*m_continuous_executions_per_thread)[execution.tid].push_back(
+            std::move(execution));
+      };
+      err = m_trace.OnCoreBinaryDataRead(
           core_id, IntelPTDataKinds::kPerfContextSwitchTrace,
           [&](ArrayRef<uint8_t> data) -> Error {
-            return DecodePerfContextSwitchTrace(
-                data, core_id, m_tsc_conversion,
-                [&](const ThreadContinuousExecution &execution) {
-                  (*m_continuous_executions_per_thread)[execution.tid]
-                      .push_back(execution);
-                });
+            return DecodePerfContextSwitchTrace(data, core_id, m_tsc_conversion,
+                                                on_new_thread_execution);
           });
-      if (err) {
-        m_setup_error = toString(std::move(err));
-        return createStringError(inconvertibleErrorCode(),
-                                 m_setup_error->c_str());
-      }
+      if (err)
+        return err;
     }
-    // We now sort the executions of each to have them ready for instruction
-    // decoding
+    // We now sort the executions of each thread to have them ready for
+    // instruction decoding
     for (auto &tid_executions : *m_continuous_executions_per_thread)
       std::sort(tid_executions.second.begin(), tid_executions.second.end());
+    for (auto &tid_executions : *m_continuous_executions_per_thread) {
+      for (auto &thread_execution : tid_executions.second) {
+        for (auto &intel_pt_execution :
+             thread_execution.m_intel_pt_executions) {
+          printf("%s\n", formatv("tid = {0}, core id = {1}, offset = {2}, "
+                                 "in_tsc = {3}, out_tsc = {4}, psb_tsc = {5}",
+                                 thread_execution.tid, thread_execution.core_id,
+                                 intel_pt_execution.psb_offset,
+                                 thread_execution.tscs.complete.start,
+                                 thread_execution.tscs.complete.end,
+                                 intel_pt_execution.tsc)
+                             .str()
+                             .c_str());
+        }
+      }
+    }
+
+    return Error::success();
   };
 
-  return m_trace.GetTimer().ForGlobal().TimeTask(
-      "Context switch trace decoding", do_decode);
+  Error err = m_trace.GetTimer().ForGlobal().TimeTask(
+      "Context switch and Intel PT traces correlation",
+      correlate_context_switches_and_intel_pt_traces);
+  if (err) {
+    m_setup_error = toString(std::move(err));
+    return createStringError(inconvertibleErrorCode(), m_setup_error->c_str());
+  }
+  return Error::success();
 }
 
 size_t TraceIntelPTMultiCoreDecoder::GetNumContinuousExecutionsForThread(