diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -192,10 +192,24 @@ .. option:: -mode=[latency|uops|inverse_throughput|analysis] - Specify the run mode. Note that if you pick `analysis` mode, you also need - to specify at least one of the `-analysis-clusters-output-file=` and - `-analysis-inconsistencies-output-file=`. + Specify the run mode. Note that some modes have additional requirements and options. + `latency` mode can make use of either RDTSC or LBR. + `latency[LBR]` is only available on X86 (at least `Skylake`). + To run in this mode, a positive value must be specified for `x86-lbr-sample-period` and `--repetition-mode=loop` must be used. + + In `analysis` mode, you also need to specify at least one of the + `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`. + +.. option:: -x86-lbr-sample-period= + + Specify the LBR sampling period - how many branches before we take a sample. + When a positive value is specified for this option and when the mode is `latency`, + we will use LBRs for measuring. + On choosing the "right" sampling period, a small value is preferred, but throttling + could occur if the sampling is too frequent. A prime number should be used to + avoid consistently skipping certain blocks. + .. option:: -repetition-mode=[duplicate|loop|min] Specify the repetition mode. 
`duplicate` will create a large, straight line diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att b/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att @@ -0,0 +1,4 @@ +# LLVM-EXEGESIS-LIVEIN RDI +# LLVM-EXEGESIS-DEFREG XMM1 42 +movq $2, %rdi +addq $0x10, %rdi \ No newline at end of file diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg b/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg @@ -0,0 +1,31 @@ +import subprocess +import lit.util + +if not ('X86' in config.root.targets): + # We need support for X86. + config.unsupported = True + +elif not ('x86_64' in config.root.host_triple): + # We need to be running on an X86 host. + config.unsupported = True + +else: + # We need libpfm to be installed and the host to be at least skylake. + llvm_exegesis_exe = lit.util.which('llvm-exegesis', config.llvm_tools_dir) + if not llvm_exegesis_exe: + print('llvm-exegesis not found') + config.unsupported = True + else: + try: + with open(os.devnull, 'w') as quiet: + check_llvm_exegesis_uops_result = subprocess.call( + [llvm_exegesis_exe, '-expected-host-cpu', 'lake', '-mode', 'uops', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) + check_llvm_exegesis_latency_result = subprocess.call( + [llvm_exegesis_exe, '-expected-host-cpu', 'lake', '-mode', 'latency', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) + except OSError: + print('could not exec llvm-exegesis') + config.unsupported = True + if not check_llvm_exegesis_uops_result == 0: + config.unsupported = True + if not check_llvm_exegesis_latency_result == 0: + config.unsupported = True diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s new file mode 100644 --- /dev/null +++ 
b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s @@ -0,0 +1,18 @@ +# RUN: llvm-exegesis -mode=latency --repetition-mode=loop --x86-lbr-sample-period=521 --snippets-file=%p/Inputs/mov_add.att + + +CHECK: --- +CHECK-NEXT: mode: latency +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: 'MOV64ri32 RDI i_0x2' +CHECK-NEXT: 'ADD64ri8 RDI RDI i_0x10' +CHECK-NEXT: config: '' +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: num_repetitions: 10000 +CHECK-NEXT: measurements: +CHECK-NEXT: {{.*}} value: 0.0001, per_snippet_value: 0.0002 {{.*}} +CHECK-LAST: ... diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h @@ -57,8 +57,9 @@ // e.g. "snb_ep::INSTRUCTION_RETIRED:e=0:i=0:c=0:t=0:u=1:k=0:mg=0:mh=1" StringRef getPfmEventString() const; -private: - const std::string EventString; +protected: + PerfEvent() = default; + std::string EventString; std::string FullQualifiedEventString; perf_event_attr *Attr; }; @@ -87,7 +88,7 @@ /// Returns the current value of the counter or error if it cannot be read. 
virtual llvm::Expected readOrError() const; -private: +protected: PerfEvent Event; #ifdef HAVE_LIBPFM int FileDescriptor = -1; diff --git a/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt --- a/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt @@ -6,6 +6,7 @@ add_library(LLVMExegesisX86 STATIC Target.cpp + X86Counter.cpp ) llvm_update_compile_flags(LLVMExegesisX86) diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -14,15 +14,40 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86.h" +#include "X86Counter.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Sequence.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" +#include +#include +#include + namespace llvm { namespace exegesis { +static cl::OptionCategory + BenchmarkOptions("llvm-exegesis benchmark x86-options"); + +// If a positive value is specified, we are going to use the LBR in +// latency-mode. +// +// Note: +// - A small value is preferred, but too low a value could result in +// throttling. +// - A prime number is preferred to avoid always skipping certain blocks. +// +static cl::opt LbrSamplingPeriod( + "x86-lbr-sample-period", + cl::desc("The sample period (nbranches/sample), used for LBR sampling"), + cl::cat(BenchmarkOptions), cl::init(0)); + +// FIXME: Validates that repetition-mode is loop if LBR is requested. + // Returns a non-null reason if we cannot handle the memory references in this // instruction. 
static const char *isInvalidMemoryInstr(const Instruction &Instr) { @@ -559,10 +584,29 @@ #include "X86GenExegesis.inc" namespace { + class ExegesisX86Target : public ExegesisTarget { public: ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {} + Expected> + createCounter(StringRef CounterName, const LLVMState &State) const override { + // If LbrSamplingPeriod was provided, then ignore the + // CounterName because we only have one for LBR. + if (LbrSamplingPeriod > 0) { + // Can't use LBR without HAVE_LIBPFM, or __linux__ (for now) +#if defined(HAVE_LIBPFM) && defined(__linux__) + return std::make_unique( + X86LbrPerfEvent(LbrSamplingPeriod)); +#else + return llvm::make_error( + "LBR counter requested without HAVE_LIBPFM or running on Linux.", + llvm::errc::invalid_argument); +#endif + } + return ExegesisTarget::createCounter(CounterName, State); + } + private: void addTargetSpecificPasses(PassManagerBase &PM) const override; diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h @@ -0,0 +1,45 @@ +//===-- X86Counter.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H +#define LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H + +#include "../PerfHelper.h" +#include "llvm/Support/Error.h" + +// FIXME: Use appropriate wrappers for poll.h and mman.h +// to support Windows and remove this linux-only guard. 
+#if defined(__linux__) && defined(HAVE_LIBPFM) + +namespace llvm { +namespace exegesis { + +class X86LbrPerfEvent : public pfm::PerfEvent { +public: + X86LbrPerfEvent(unsigned SamplingPeriod); +}; + +class X86LbrCounter : public pfm::Counter { +public: + explicit X86LbrCounter(pfm::PerfEvent &&Event); + + virtual ~X86LbrCounter(); + + void start() override; + llvm::Expected readOrError() const override; + +private: + void *MMappedBuffer = nullptr; +}; + +} // namespace exegesis +} // namespace llvm + +#endif // defined(__linux__) && defined(HAVE_LIBPFM) + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp @@ -0,0 +1,221 @@ +//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "X86Counter.h" + +// FIXME: Use appropriate wrappers for poll.h and mman.h +// to support Windows and remove this linux-only guard. +#ifdef __linux__ +#include "llvm/Support/Endian.h" +#include "llvm/Support/Errc.h" + +#ifdef HAVE_LIBPFM +#include "perfmon/perf_event.h" +#include "perfmon/pfmlib.h" +#include "perfmon/pfmlib_perf_event.h" +#endif // HAVE_LIBPFM + +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef HAVE_LIBPFM +namespace llvm { +namespace exegesis { + +static constexpr size_t kBufferPages = 8; +static const size_t kDataBufferSize = kBufferPages * getpagesize(); + +// Waits for the LBR perf events. 
+static int pollLbrPerfEvent(const int FileDescriptor) { + struct pollfd PollFd; + PollFd.fd = FileDescriptor; + PollFd.events = POLLIN; + PollFd.revents = 0; + return poll(&PollFd, 1 /* num of fds */, 10000 /* timeout in ms */); +} + +// Copies the data-buffer into Buf, given the pointer to MMapped. +static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail, + size_t DataSize) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page. + char *Start = reinterpret_cast(MMappedBuffer) + getpagesize(); + // The LBR buffer is a cyclic buffer, we copy data to another buffer. + uint64_t Offset = Tail % kDataBufferSize; + size_t CopySize = kDataBufferSize - Offset; + memcpy(Buf, Start + Offset, CopySize); + if (CopySize >= DataSize) + return; + + memcpy(Buf + CopySize, Start, Offset); + return; +} + +// Parses the given data-buffer for stats and fills the CycleArray. +// Non-sample records are skipped. Returns success once all branch entries of +// the first non-empty sample record are appended, or an error otherwise. +static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize, + std::vector *CycleArray) { + const char *DataPtr = DataBuf; + while (DataPtr < DataBuf + DataSize) { + struct perf_event_header Header; + memcpy(&Header, DataPtr, sizeof(struct perf_event_header)); + if (Header.type != PERF_RECORD_SAMPLE) { + // Ignores non-sample records. + DataPtr += Header.size; + continue; + } + DataPtr += sizeof(Header); + uint64_t Count = llvm::support::endian::read64(DataPtr, support::native); + DataPtr += sizeof(Count); + + struct perf_branch_entry Entry; + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + // Read the perf_branch_entry array. + for (uint64_t i = 0; i < Count; ++i) { + CycleArray->push_back(Entry.cycles); + if (i == Count - 1) + // We've reached the last entry. 
+ return llvm::Error::success(); + + // Advance to next entry + DataPtr += sizeof(Entry); + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + } + } + return llvm::make_error("Unable to parse databuffer.", + llvm::errc::io_error); +} + +#ifdef HAVE_LIBPFM +X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) { + assert(SamplingPeriod > 0 && "SamplingPeriod must be positive"); + EventString = "BR_INST_RETIRED.NEAR_TAKEN"; + Attr = new perf_event_attr(); + *Attr = {0}; + Attr->size = sizeof(*Attr); + Attr->type = PERF_TYPE_RAW; + // FIXME This is SKL's encoding. Not sure if it'll change. + Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN + Attr->sample_type = PERF_SAMPLE_BRANCH_STACK; + // Don't need to specify "USER" because we've already excluded HV and Kernel. + Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY; + Attr->sample_period = SamplingPeriod; + Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH. + Attr->disabled = 1; + Attr->exclude_kernel = 1; + Attr->exclude_hv = 1; + Attr->read_format = PERF_FORMAT_GROUP; + + FullQualifiedEventString = EventString; +} +#else +X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) { + EventString = ""; + Attr = nullptr; +} +#endif + +X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent) + : Counter(std::move(NewEvent)) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page, so we allocate one more page. 
+ MMappedBuffer = mmap(nullptr, (kBufferPages + 1) * getpagesize(), + PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0); + if (MMappedBuffer == MAP_FAILED) { + llvm::errs() << "Failed to mmap buffer."; + } +} + +X86LbrCounter::~X86LbrCounter() { close(FileDescriptor); } + +void X86LbrCounter::start() { + ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */); +} + +static bool CheckLbrFormat(const struct perf_event_mmap_page &Page) { + static const uint64_t mask = 0x000000000000003F; + // We are looking for the format that contains "cycles". + // (Intel SDM, vol 3B 17.4.8.1) + // "MSR IA32_PERF_CAPABILITIES[5:0]" == ***110B + + // The type of perf_event_mmap_page::capabilities is a union of {uint64_t, + // and a struct with 64 bits}. But you're allowed to read the common prefix + // regardless of which union member is active. + // So it should be safe to just access the `uint64_t capabilities` + // member. + return (Page.capabilities & mask) & 6; +} + +llvm::Expected X86LbrCounter::readOrError() const { + // The max number of time-outs/retries before we give up. + static constexpr int kMaxTimeouts = 160; + + // Disable the event before reading + ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0); + + // Parses the LBR buffer and fills CycleArray with the sequence of cycle + // counts from the buffer. 
+ std::vector CycleArray; + std::unique_ptr DataBuf(new char[kDataBufferSize]); + int NumTimeouts = 0; + int PollResult = 0; + while (PollResult <= 0) { + PollResult = pollLbrPerfEvent(FileDescriptor); + if (PollResult > 0) + break; + if (PollResult == -1) + return llvm::make_error("Cannot poll LBR perf event.", + llvm::errc::io_error); + if (NumTimeouts++ >= kMaxTimeouts) + return llvm::make_error( + "LBR polling still timed out after max number of attempts.", + llvm::errc::device_or_resource_busy); + } + + struct perf_event_mmap_page Page; + memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page)); + // Check to see if the LBR format is expected. + // FIXME: It would be nicer if we could detect the format way earlier. + // (E.g., check the cap bits at the beginning of the benchmark.) + if (!CheckLbrFormat(Page)) + return llvm::make_error("Unexpected LBR format", + llvm::errc::not_supported); + + const uint64_t DataTail = Page.data_tail; + const uint64_t DataHead = Page.data_head; + // We're supposed to use a barrier after reading data_head. + std::atomic_thread_fence(std::memory_order_acq_rel); + const size_t DataSize = DataHead - DataTail; + if (DataSize > kDataBufferSize) + return llvm::make_error( + "DataSize larger than buffer size.", llvm::errc::invalid_argument); + + copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize); + + llvm::Error error = parseDataBuffer(DataBuf.get(), DataSize, &CycleArray); + if (!error) + // FIXME: Rework the Counter interface to allow returning more than one read, + // because we should report the cycle counts for all jumps, not just the + // most recent. 
+ return CycleArray[0]; + return error; +} + +} // namespace exegesis +} // namespace llvm + +#endif // HAVE_LIBPFM +#endif // __linux__ diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -145,6 +145,12 @@ cl::desc(""), cl::cat(AnalysisOptions), cl::init("")); +static cl::opt + ExpectedHostCpu("expected-host-cpu", + cl::desc("if non-empty, only run the benchmark if the host " + "CPU matches the given name."), + cl::cat(Options), cl::init("")); + static cl::opt AnalysisDisplayUnstableOpcodes( "analysis-display-unstable-clusters", cl::desc("if there is more than one benchmark for an opcode, said " @@ -281,6 +287,13 @@ const LLVMState State(CpuName); + if (!ExpectedHostCpu.empty()) { + // The actual name could include variations, such as "skylake" vs + // "skylake-avx512" so we don't look for exact match. + std::string actual(State.getTargetMachine().getTargetCPU().data()); + if (actual.find(ExpectedHostCpu) == std::string::npos) + ExitWithError("Unexpected host CPU " + actual); + } const std::unique_ptr Runner = ExitOnErr( State.getExegesisTarget().createBenchmarkRunner(BenchmarkMode, State)); if (!Runner) {