diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -192,10 +192,24 @@ .. option:: -mode=[latency|uops|inverse_throughput|analysis] - Specify the run mode. Note that if you pick `analysis` mode, you also need - to specify at least one of the `-analysis-clusters-output-file=` and - `-analysis-inconsistencies-output-file=`. + Specify the run mode. Note that some modes have additional requirements and options. + `latency` mode can make use of either RDTSC or LBR. + `latency[LBR]` is only available on X86 (at least `Skylake`). + To run in this mode, a positive value must be specified for `x86-lbr-sample-period`. + + In `analysis` mode, you also need to specify at least one of the + `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`. + +.. option:: -x86-lbr-sample-period= + + Specify the LBR sampling period - how many branches before we take a sample. + When a positive value is specified for this option and when the mode is `latency`, + we will use LBRs for measuring. + On choosing the "right" sampling period, a small value is preferred, but throttling + could occur if the sampling is too frequent. A prime number should be used to + avoid consistently skipping certain blocks. + .. option:: -repetition-mode=[duplicate|loop|min] Specify the repetition mode. 
`duplicate` will create a large, straight line diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h @@ -20,6 +20,7 @@ #include "BenchmarkResult.h" #include "LlvmState.h" #include "MCInstrDescView.h" +#include "PerfHelper.h" #include "SnippetRepetitor.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Error.h" diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -7,18 +7,24 @@ //===----------------------------------------------------------------------===// #include +#include +#include #include +#include #include "Assembler.h" +#include "BenchmarkResult.h" #include "BenchmarkRunner.h" #include "Error.h" #include "MCInstrDescView.h" #include "PerfHelper.h" +#include "Target.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/CrashRecoveryContext.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Program.h" @@ -78,6 +84,113 @@ const ExecutableFunction Function; BenchmarkRunner::ScratchSpace *const Scratch; }; + +class WorkerThreadFunctionExecutorImpl + : public BenchmarkRunner::FunctionExecutor { +public: + WorkerThreadFunctionExecutorImpl( + const LLVMState &State, + llvm::object::OwningBinary Obj, + BenchmarkRunner::ScratchSpace *Scratch) + : State(State), Function(State.createTargetMachine(), std::move(Obj)), + Scratch(Scratch) {} + + struct ThreadArg { + std::mutex Mutex; + std::unique_ptr CounterPtr; + std::condition_variable ConditionVariable; + bool Crashed = false; // Read by the waiter before the worker writes it. + std::string ErrMsg; + const char *CounterName; + }; + + void workerThread(struct ThreadArg 
*Arg) const { + auto CounterOrError = + State.getExegesisTarget().createCounter(Arg->CounterName, State); + if (!CounterOrError) { + std::lock_guard Lock(Arg->Mutex); + Arg->Crashed = true; + llvm::errs() << "Error creating counter: " << CounterOrError.takeError(); + Arg->ErrMsg = "Cannot create counter."; + Arg->ConditionVariable.notify_one(); + return; + } + { + std::lock_guard Lock(Arg->Mutex); + Arg->CounterPtr = std::move(*CounterOrError); // Transfer ownership exactly once. + } + Arg->ConditionVariable.notify_one(); + { + llvm::CrashRecoveryContext CRC; + llvm::CrashRecoveryContext::Enable(); + const bool Crashed = !CRC.RunSafely([&]() { + Arg->CounterPtr->start(); + Function(Scratch->ptr()); + Arg->CounterPtr->stop(); + }); + llvm::CrashRecoveryContext::Disable(); + { + std::lock_guard Lock(Arg->Mutex); + Arg->CounterPtr.reset(nullptr); + } + // FIXME: Better diagnosis. + if (Crashed) { + Arg->Crashed = true; + return; + } + } + Arg->Crashed = false; + } + + llvm::Expected + runAndMeasure(const char *CounterName) const override { + Scratch->clear(); + struct ThreadArg Arg; + Arg.CounterPtr = nullptr; + Arg.CounterName = CounterName; + std::thread Worker(&WorkerThreadFunctionExecutorImpl::workerThread, this, + &Arg); + { + // Waits until the counter is ready or a "Crashed" signal if + // it could not be created. + std::unique_lock Lock(Arg.Mutex); + Arg.ConditionVariable.wait( + Lock, [&Arg] { return Arg.Crashed || Arg.CounterPtr != nullptr; }); + } + if (Arg.Crashed) { + Worker.join(); // A joinable std::thread must not be destroyed. + return make_error( + "Snippet crashed while running in thread. 
Reason: [" + Arg.ErrMsg + + "]"); + } + + auto ValueOrError = Arg.CounterPtr->readOrError(); + Worker.join(); + if (!ValueOrError) { + return ValueOrError.takeError(); + } + return ValueOrError.get(); + } + + const LLVMState &State; + const ExecutableFunction Function; + BenchmarkRunner::ScratchSpace *const Scratch; +}; + +std::unique_ptr +createFunctionExecutor(InstructionBenchmark::ModeE Mode, const LLVMState &State, + object::OwningBinary ObjectFile, + BenchmarkRunner::ScratchSpace *Scratch) { + const ExegesisTarget &Target = State.getExegesisTarget(); + if (Target.useWorkerThreadForBenchmark(State, Mode)) { + return std::make_unique( + State, std::move(ObjectFile), Scratch); + } + + return std::make_unique(State, std::move(ObjectFile), + Scratch); +} + } // namespace Expected BenchmarkRunner::runConfiguration( @@ -127,6 +240,7 @@ OS)) { return std::move(E); } + const ExecutableFunction EF(State.createTargetMachine(), getObjectFromBuffer(OS.str())); const auto FnBytes = EF.getFunctionBytes(); @@ -161,9 +275,11 @@ ObjectFile = getObjectFromBuffer(OS.str()); } - const FunctionExecutorImpl Executor(State, std::move(ObjectFile), - Scratch.get()); - auto NewMeasurements = runMeasurements(Executor); + std::unique_ptr Executor = + createFunctionExecutor(Mode, State, std::move(ObjectFile), + Scratch.get()); + + auto NewMeasurements = runMeasurements(*Executor); if (Error E = NewMeasurements.takeError()) { if (!E.isA()) return std::move(E); diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h @@ -17,6 +17,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" +#include "llvm/Support/Error.h" +#include #include #include @@ -36,7 +38,7 @@ public: // http://perfmon2.sourceforge.net/manv4/libpfm.html // Events are expressed as strings. e.g. 
"INSTRUCTION_RETIRED" - explicit PerfEvent(StringRef pfm_event_string); + explicit PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod = 0); PerfEvent(const PerfEvent &) = delete; PerfEvent(PerfEvent &&other); @@ -56,6 +58,9 @@ StringRef getPfmEventString() const; private: + void initPerfEvent(); + void initPerfEventForLbr(unsigned SamplingPeriod); + const std::string EventString; std::string FullQualifiedEventString; perf_event_attr *Attr; @@ -63,24 +68,35 @@ // Uses a valid PerfEvent to configure the Kernel so we can measure the // underlying event. -struct Counter { +class Counter { +public: // event: the PerfEvent to measure. explicit Counter(PerfEvent &&event); Counter(const Counter &) = delete; Counter(Counter &&other) = default; - ~Counter(); + virtual ~Counter(); - void start(); // Starts the measurement of the event. - void stop(); // Stops the measurement of the event. - int64_t read() const; // Return the current value of the counter. + /// Starts the measurement of the event. + virtual void start(); -private: - PerfEvent Event; + /// Stops the measurement of the event. + void stop(); + + /// Returns the current value of the counter. + virtual int64_t read() const; + + /// Returns the current value of the counter or error if it cannot be read. 
+ virtual llvm::Expected readOrError() const { return read(); } + +protected: #ifdef HAVE_LIBPFM int FileDescriptor = -1; -#endif +#endif // HAVE_LIBPFM + +private: + PerfEvent Event; }; } // namespace pfm diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp @@ -8,13 +8,23 @@ #include "PerfHelper.h" #include "llvm/Config/config.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" +#include #ifdef HAVE_LIBPFM #include "perfmon/perf_event.h" #include "perfmon/pfmlib.h" #include "perfmon/pfmlib_perf_event.h" #endif +#include #include +#include +#include +#include +#include +#include namespace llvm { namespace exegesis { @@ -52,8 +62,7 @@ Other.Attr = nullptr; } -PerfEvent::PerfEvent(StringRef PfmEventString) - : EventString(PfmEventString.str()), Attr(nullptr) { +void PerfEvent::initPerfEvent() { #ifdef HAVE_LIBPFM char *Fstr = nullptr; pfm_perf_encode_arg_t Arg = {}; @@ -77,6 +86,39 @@ #endif } +void PerfEvent::initPerfEventForLbr(unsigned SamplingPeriod) { +#ifdef HAVE_LIBPFM + assert(SamplingPeriod > 0 && "SamplingPeriod must be positive"); + Attr = new perf_event_attr(); + *Attr = {0}; + Attr->size = sizeof(*Attr); + Attr->type = PERF_TYPE_RAW; // config is a raw PMU encoding, not a PERF_COUNT_HW_* id. + // Look up encodings at + // https://download.01.org/perfmon/SKL/skylake_core_v48.json + Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN + Attr->sample_type = PERF_SAMPLE_BRANCH_STACK; + // Don't need to specify "USER" because we've already excluded HV and Kernel. + Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY; + Attr->sample_period = SamplingPeriod; + Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH. 
+ Attr->disabled = 1; + Attr->exclude_kernel = 1; + Attr->exclude_hv = 1; + Attr->read_format = PERF_FORMAT_GROUP; + + FullQualifiedEventString = "LBR"; +#endif +} + +PerfEvent::PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod) + : EventString(PfmEventString.str()), Attr(nullptr) { + if (EventString == "LBR") { + initPerfEventForLbr(SamplingPeriod); + } else { + initPerfEvent(); + } +} + StringRef PerfEvent::name() const { return EventString; } bool PerfEvent::valid() const { return !FullQualifiedEventString.empty(); } diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ b/llvm/tools/llvm-exegesis/lib/Target.h @@ -27,6 +27,7 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Error.h" namespace llvm { namespace exegesis { @@ -65,6 +66,16 @@ explicit ExegesisTarget(ArrayRef CpuPfmCounters) : CpuPfmCounters(CpuPfmCounters) {} + /// Returns true if the benchmark should use a worker-thread. + /// For most target/benchmark-mode, it is not necessary. 
+ virtual bool useWorkerThreadForBenchmark(const LLVMState &, + InstructionBenchmark::ModeE) const { + return false; + } + + virtual Expected> + createCounter(const char *CounterName, const LLVMState &State) const; + // Targets can use this to add target-specific passes in assembleToStream(); virtual void addTargetSpecificPasses(PassManagerBase &PM) const {} diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -27,6 +27,12 @@ return nullptr; } +Expected> +ExegesisTarget::createCounter(const char *CounterName, + const LLVMState &) const { + return std::make_unique(pfm::PerfEvent(CounterName)); +} + void ExegesisTarget::registerTarget(ExegesisTarget *Target) { if (FirstTarget == nullptr) { FirstTarget = Target; diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -14,15 +14,37 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86.h" +#include "X86Counter.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Sequence.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" +#include +#include +#include + namespace llvm { namespace exegesis { +static cl::OptionCategory BenchmarkOptions("llvm-exegesis benchmark x86-options"); + +// If a positive value is specified, we are going to use the LBR in +// latency-mode. +// +// Note: +// - A small value is preferred, but too low a value could result in +// throttling. +// - A prime number is preferred to avoid always skipping certain blocks. 
+// +static cl::opt LbrSamplingPeriod( + "x86-lbr-sample-period", + cl::desc("The sample period (nbranches/sample), used for LBR sampling"), + cl::cat(BenchmarkOptions), cl::init(0)); + // Returns a non-null reason if we cannot handle the memory references in this // instruction. static const char *isInvalidMemoryInstr(const Instruction &Instr) { @@ -559,10 +581,38 @@ #include "X86GenExegesis.inc" namespace { + class ExegesisX86Target : public ExegesisTarget { public: ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {} + bool + useWorkerThreadForBenchmark(const LLVMState &State, + InstructionBenchmark::ModeE Mode) const override { + // If it is Latency with LBR, should use worker-thread + return Mode == InstructionBenchmark::ModeE::Latency && + LbrSamplingPeriod > 0; + } + + Expected> + createCounter(const char *CounterName, + const LLVMState &State) const override { + // Only hand out the LBR counter when LBR sampling was requested. + if (LbrSamplingPeriod > 0 && + CounterName == State.getPfmCounters().CycleCounter) { + // Can't use LBR without HAVE_LIBPFM, or __linux__ (for now) +#if defined(HAVE_LIBPFM) && defined(__linux__) + return std::make_unique( + pfm::PerfEvent("LBR", LbrSamplingPeriod)); +#else + return llvm::make_error( + "LBR counter requested without HAVE_LIBPFM or running on Linux.", + llvm::errc::invalid_argument); +#endif + } + return ExegesisTarget::createCounter(CounterName, State); + } + private: void addTargetSpecificPasses(PassManagerBase &PM) const override; diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h @@ -0,0 +1,41 @@ +//===-- X86Counter.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H +#define LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H + +#include "../PerfHelper.h" +#include "llvm/Support/Error.h" + +// FIXME: Use appropriate wrappers for poll.h and mman.h +// to support Windows and remove this linux-only guard. +#if defined(__linux__) && defined(HAVE_LIBPFM) + +namespace llvm { +namespace exegesis { + +class X86LbrCounter : public pfm::Counter { +public: + explicit X86LbrCounter(pfm::PerfEvent &&Event); + + virtual ~X86LbrCounter(); + + void start() override; + int64_t read() const override; + llvm::Expected readOrError() const override; + +private: + void *MMappedBuffer = nullptr; +}; + +} // namespace exegesis +} // namespace llvm + +#endif // defined(__linux__) && defined(HAVE_LIBPFM) + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp @@ -0,0 +1,252 @@ +//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "X86Counter.h" + +// FIXME: Use appropriate wrappers for poll.h and mman.h +// to support Windows and remove this linux-only guard. 
+#ifdef __linux__ +#include "llvm/Support/Errc.h" + +#ifdef HAVE_LIBPFM +#include "perfmon/perf_event.h" +#include "perfmon/pfmlib.h" +#include "perfmon/pfmlib_perf_event.h" +#endif // HAVE_LIBPFM + +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef HAVE_LIBPFM +namespace llvm { +namespace exegesis { + +static constexpr int kBufferPages = 8; +static const int kDataBufferSize = kBufferPages * getpagesize(); + +// Waits for the LBR perf events. +static int pollLbrPerfEvent(const int FileDescriptor) { + struct pollfd PollFd; + PollFd.fd = FileDescriptor; + PollFd.events = POLLIN; + PollFd.revents = 0; + return poll(&PollFd, 1 /* num of fds */, 10000 /* timeout in ms */); +} + +// Copies the data-buffer into Buf, given the pointer to MMapped. +static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail, + size_t DataSize) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page. + char *Start = reinterpret_cast(MMappedBuffer) + getpagesize(); + // The LBR buffer is a cyclic buffer, we copy data to another buffer. + uint64_t Offset = Tail % kDataBufferSize; + size_t CopySize = kDataBufferSize - Offset; + memcpy(Buf, Start + Offset, CopySize); + if (CopySize >= DataSize) + return; + memcpy(Buf + CopySize, Start, Offset); +} + +// FIXME: Remove this hack. +// Option #1: Make the assembler produce code with a condition var for stopping +// the loop, which can be found statically (at fixed addr) and remove this +// hack of modifying the code while running. +// Option #2: Run benchmarked code in the same thread, and make use of +// Freeze_LBRs_On_PMI +// +// Rewrites the jmp (from the BM loop) with `pop %rbx; ret` to get +// out of the benchmarked code. +// +// pc: Must point at the start of the jmp instruction. 
+static void patchBasicBlockToEndBenchmarkedLoop(char *pc) { + void *page = reinterpret_cast(reinterpret_cast(pc) & + ~(getpagesize() - 1)); + mprotect(page, getpagesize(), PROT_READ | PROT_WRITE | PROT_EXEC); + if ((reinterpret_cast(pc) & 0xf) != 0xf) { + // We have at least two bytes in the same cacheline, so we can use an + // atomic write to replace the jmp with pop, ret. + *reinterpret_cast(pc) = 0xc35b; // pop %rbx; ret; + } else { + // pc and pc+1 might be at different cacheline, so we first add the pop, + // ret instructions, and then update jump offset to jump to pop + // instruction. + *reinterpret_cast(pc + 5) = 0xc35b; // pop %rbx; ret; + // An atomic update on the displacement. + if ((pc[0] & 0xff) == 0xeb /* x86 short jump */) { + // offset is 1 byte + pc[1] = 0x03; + } else { /* x86 near jump */ + // offset is 4 bytes + *reinterpret_cast(pc + 1) = 0; + } + } +} + +// Parses the given data-buffer for stats and fills the CycleArray. +// If data has been extracted successfully, also modifies the code to jump +// out the benchmark loop. +static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize, + std::vector *CycleArray) { + const char *DataPtr = DataBuf; + while (DataPtr < DataBuf + DataSize) { + struct perf_event_header Header; + memcpy(&Header, DataPtr, sizeof(struct perf_event_header)); + if (Header.type != PERF_RECORD_SAMPLE) { + // Ignores non-sample records. + DataPtr += Header.size; + continue; + } + DataPtr += sizeof(Header); + uint64_t Count = llvm::support::endian::read64(DataPtr, support::native); + DataPtr += sizeof(Count); + + struct perf_branch_entry Entry; + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + // Read the perf_branch_entry array. + char *JumpPC = reinterpret_cast(Entry.from); + int64_t MinCycle = Entry.cycles; + for (uint64_t i = 0; i < Count; ++i) { + // We use the JumpPC from the entry with min cycle to avoid the entry + // that is returning from kernel. 
+ if (MinCycle > Entry.cycles) { + MinCycle = Entry.cycles; + JumpPC = reinterpret_cast(Entry.from); + } + CycleArray->push_back(Entry.cycles); + if (i == Count - 1) { + // If we're at the last entry, then terminate the loop. + patchBasicBlockToEndBenchmarkedLoop(JumpPC); + return llvm::Error::success(); + } + // Advance to next entry + DataPtr += sizeof(Entry); + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + } + } + return llvm::make_error("Unable to read databuffer.", + llvm::errc::io_error); +} + +X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&Event) + : Counter(std::move(Event)) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page, so we allocate one more page. + MMappedBuffer = mmap(nullptr, (kBufferPages + 1) * getpagesize(), + PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0); + if (MMappedBuffer == MAP_FAILED) { + llvm::errs() << "Failed to mmap buffer."; + } +} + +X86LbrCounter::~X86LbrCounter() { + // Release the ring buffer mapped by the constructor. + if (MMappedBuffer != MAP_FAILED) + munmap(MMappedBuffer, (kBufferPages + 1) * getpagesize()); + close(FileDescriptor); +} + +void X86LbrCounter::start() { + ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */); +} + +int64_t X86LbrCounter::read() const { + auto error = readOrError(); + if (error) { // Expected<> converts to true when it holds a value. + return error.get(); + } + llvm::errs() << "Error reading counter: " << error.takeError() << "\n"; + return 0; +} + +static bool checkLbrFormat(const struct perf_event_mmap_page &Page) { + static const uint64_t mask = 0x0000003F; // Bits [5:0]. + // We are looking for the format that contains "cycles". + // (Intel SDM, vol 3B 17.4.8.1) + // "MSR IA32_PERF_CAPABILITIES[5:0]" == 000110B + + // perf_event_mmap_page::capabilities is a union of {uint64_t, + // or a struct with 64 bits}. But you're allowed to read common prefix + // regardless of which union member is active. 
+ // So it should be safe to just access the `uint64_t capabilities` + // member. 
+ return (Page.capabilities & mask) == 0x00000006; +} + +llvm::Expected X86LbrCounter::readOrError() const { + // The max number of time-outs/retries before we give up. + static constexpr int kMaxTimeouts = 160; + + // Parses the LBR buffer and fills CycleArray with the sequence of cycle + // counts from the buffer. + std::vector CycleArray; + std::unique_ptr DataBuf(new char[kDataBufferSize]); + int NumTimeouts = 0; + int PollResult = 0; + while (PollResult <= 0) { + PollResult = pollLbrPerfEvent(FileDescriptor); + if (PollResult == -1) { + return llvm::make_error("Cannot poll LBR perf event.", + llvm::errc::io_error); + } else if (PollResult == 0) { + llvm::errs() << "LBR polling timed out without result, NumTimeouts =" + << NumTimeouts << ". "; + if (NumTimeouts < kMaxTimeouts) { + llvm::errs() << "Retrying ...\n"; + ++NumTimeouts; + continue; + } else { + llvm::errs() << "At max-timeouts. Giving up.\n"; + return llvm::make_error( + "LBR polling still timed out after max number of attempts.", + llvm::errc::device_or_resource_busy); + } + } + } + + struct perf_event_mmap_page Page; + memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page)); + // Check to see if the LBR format is expected. + // FIXME: It would be nicer if we could detect the format way earlier. + if (!checkLbrFormat(Page)) { + return llvm::make_error("Unexpected LBR format", + llvm::errc::not_supported); + } + + const uint64_t DataTail = Page.data_tail; + const uint64_t DataHead = Page.data_head; + // We're supposed to use a barrier after reading data_head. 
+ std::atomic_thread_fence(std::memory_order_acq_rel); + const size_t DataSize = DataHead - DataTail; + if (DataSize > kDataBufferSize) { + return llvm::make_error( + "DataSize larger than buffer size.", llvm::errc::invalid_argument); + } + copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize); + llvm::Error error = parseDataBuffer(DataBuf.get(), DataSize, &CycleArray); + if (!error) { + // FIXME We should report the cycles count for all jumps, + // not just the most recent. + return CycleArray[0]; + } + + return std::move(error); +} + +} // namespace exegesis +} // namespace llvm + +#endif // HAVE_LIBPFM +#endif // __linux__