diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -190,11 +190,19 @@ Specify the custom code snippet to measure. See example 2 for details. Either `opcode-index`, `opcode-name` or `snippets-file` must be set. -.. option:: -mode=[latency|uops|inverse_throughput|analysis] +.. option:: -mode=[latency|lbr_latency|uops|inverse_throughput|analysis] + + Specify the run mode. Note that some modes have additional requirements. + + `lbr_latency` mode makes use of LBR, which, starting with Skylake, contains the + precise number of cycles between the two consecutive branches. This will be + significantly more precise than the method using RDTSC. This mode should be run + with at least `Haswell`, but preferably `Skylake` for more precise measurements. + Using `lbr_latency` requires setting `lbr-sample-period`. + + In `analysis` mode, you also need to specify at least one of the + `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`. - Specify the run mode. Note that if you pick `analysis` mode, you also need - to specify at least one of the `-analysis-clusters-output-file=` and - `-analysis-inconsistencies-output-file=`. .. option:: -repetition-mode=[duplicate|loop|min] diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -59,7 +59,15 @@ // The result of an instruction benchmark. struct InstructionBenchmark { InstructionBenchmarkKey Key; - enum ModeE { Unknown, Latency, Uops, InverseThroughput }; + enum ModeE { + Unknown, + Latency, + Uops, + InverseThroughput, + // LbrLatency mode is only available on x86. + // Must be used with at least Haswell. 
+ LbrLatency, + }; ModeE Mode; std::string CpuName; std::string LLVMTriple; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h @@ -20,6 +20,7 @@ #include "BenchmarkResult.h" #include "LlvmState.h" #include "MCInstrDescView.h" +#include "PerfHelper.h" #include "SnippetRepetitor.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Error.h" @@ -33,16 +34,26 @@ // Common code for all benchmark modes. class BenchmarkRunner { public: + struct RunArg { + const BenchmarkCode &CodeSnippet; + unsigned NumRepetitions; + ArrayRef> Repetitors; + bool DumpObjectToDisk; + unsigned LbrSamplePeriod; + }; explicit BenchmarkRunner(const LLVMState &State, InstructionBenchmark::ModeE Mode); virtual ~BenchmarkRunner(); + // TODO(vyng) Make this "Deprecated" and switch caller to using RunArg Expected runConfiguration(const BenchmarkCode &Configuration, unsigned NumRepetitions, ArrayRef> Repetitors, bool DumpObjectToDisk) const; + Expected runConfiguration(const RunArg &Arg) const; + // Scratch space to run instructions that touch memory. 
struct ScratchSpace { static constexpr const size_t kAlignment = 1024; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -7,9 +7,13 @@ //===----------------------------------------------------------------------===// #include +#include +#include #include +#include #include "Assembler.h" +#include "BenchmarkResult.h" #include "BenchmarkRunner.h" #include "Error.h" #include "MCInstrDescView.h" @@ -19,6 +23,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/CrashRecoveryContext.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Program.h" @@ -78,23 +83,119 @@ const ExecutableFunction Function; BenchmarkRunner::ScratchSpace *const Scratch; }; + +class LbrFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { +public: + LbrFunctionExecutorImpl( + const LLVMState &State, + llvm::object::OwningBinary Obj, + BenchmarkRunner::ScratchSpace *Scratch, unsigned LbrSamplePeriod) + : Function(State.createTargetMachine(), std::move(Obj)), Scratch(Scratch), + LbrSamplePeriod(LbrSamplePeriod) {} + +private: + struct ThreadArg { + std::mutex Mutex; + pfm::Counter *CounterPtr; + std::condition_variable ConditionVariable; + bool Crashed; + }; + + void workerThread(ThreadArg *Arg) const { + pfm::PerfEvent PerfEvent("LBR", LbrSamplePeriod); + pfm::Counter Counter(PerfEvent); + { + std::lock_guard Lock(Arg->Mutex); + Arg->CounterPtr = &Counter; + } + Arg->ConditionVariable.notify_one(); + { + llvm::CrashRecoveryContext CRC; + llvm::CrashRecoveryContext::Enable(); + const bool Crashed = !CRC.RunSafely([&]() { + Counter.start(); + Function(Scratch->ptr()); + Counter.stop(); + }); + llvm::CrashRecoveryContext::Disable(); + { + std::lock_guard Lock(Arg->Mutex); + Arg->CounterPtr = nullptr; + 
} + // FIXME: Better diagnosis. + if (Crashed) { + Arg->Crashed = true; + return; + } + } + Arg->Crashed = false; + } + + llvm::Expected runAndMeasure(const char *) const override { + Scratch->clear(); + struct ThreadArg Arg; + Arg.CounterPtr = nullptr; + std::thread Worker(&LbrFunctionExecutorImpl::workerThread, this, &Arg); + { + // Waits until the counter is ready. + std::unique_lock Lock(Arg.Mutex); + Arg.ConditionVariable.wait(Lock, + [&Arg] { return Arg.CounterPtr != nullptr; }); + } + if (Arg.Crashed) { + return make_error( + "Snippet crashed while running in thread."); + } + + auto valueOrError = Arg.CounterPtr->readOrError(); + Worker.join(); + if (valueOrError) { + return valueOrError.takeError(); + } + return valueOrError.get(); + } + + const ExecutableFunction Function; + BenchmarkRunner::ScratchSpace *const Scratch; + unsigned LbrSamplePeriod; +}; + +std::unique_ptr +createFunctionExecutor(InstructionBenchmark::ModeE Mode, const LLVMState &State, + object::OwningBinary ObjectFile, + BenchmarkRunner::ScratchSpace *Scratch, + const BenchmarkRunner::RunArg &Arg) { + if (Mode == InstructionBenchmark::ModeE::LbrLatency) { + return std::make_unique( + State, std::move(ObjectFile), Scratch, Arg.LbrSamplePeriod); + } + return std::make_unique(State, std::move(ObjectFile), + Scratch); +} + } // namespace Expected BenchmarkRunner::runConfiguration( const BenchmarkCode &BC, unsigned NumRepetitions, ArrayRef> Repetitors, bool DumpObjectToDisk) const { + RunArg arg{BC, NumPrepetitions, Repetitors, DumObjectToDisk}; + return runConfiguration(arg); +} + +Expected +BenchmarkRunner::runConfiguration(const RunArg &Arg) const { InstructionBenchmark InstrBenchmark; InstrBenchmark.Mode = Mode; InstrBenchmark.CpuName = std::string(State.getTargetMachine().getTargetCPU()); InstrBenchmark.LLVMTriple = State.getTargetMachine().getTargetTriple().normalize(); - InstrBenchmark.NumRepetitions = NumRepetitions; - InstrBenchmark.Info = BC.Info; + InstrBenchmark.NumRepetitions = 
Arg.NumRepetitions; + InstrBenchmark.Info = Arg.CodeSnippet.Info; - const std::vector &Instructions = BC.Key.Instructions; + const std::vector &Instructions = Arg.CodeSnippet.Key.Instructions; - InstrBenchmark.Key = BC.Key; + InstrBenchmark.Key = Arg.CodeSnippet.Key; // If we end up having an error, and we've previously succeeded with // some other Repetitor, we want to discard the previous measurements. @@ -112,7 +213,8 @@ }; ClearBenchmarkOnReturn CBOR(&InstrBenchmark); - for (const std::unique_ptr &Repetitor : Repetitors) { + for (const std::unique_ptr &Repetitor : + Arg.Repetitors) { // Assemble at least kMinInstructionsForSnippet instructions by repeating // the snippet for debug/analysis. This is so that the user clearly // understands that the inside instructions are repeated. @@ -122,11 +224,13 @@ raw_svector_ostream OS(Buffer); if (Error E = assembleToStream( State.getExegesisTarget(), State.createTargetMachine(), - BC.LiveIns, BC.Key.RegisterInitialValues, + Arg.CodeSnippet.LiveIns, + Arg.CodeSnippet.Key.RegisterInitialValues, Repetitor->Repeat(Instructions, kMinInstructionsForSnippet), OS)) { return std::move(E); } + const ExecutableFunction EF(State.createTargetMachine(), getObjectFromBuffer(OS.str())); const auto FnBytes = EF.getFunctionBytes(); @@ -155,15 +259,18 @@ raw_svector_ostream OS(Buffer); if (Error E = assembleToStream( State.getExegesisTarget(), State.createTargetMachine(), - BC.LiveIns, BC.Key.RegisterInitialValues, Filler, OS)) { + Arg.CodeSnippet.LiveIns, + Arg.CodeSnippet.Key.RegisterInitialValues, Filler, OS)) { return std::move(E); } ObjectFile = getObjectFromBuffer(OS.str()); } - const FunctionExecutorImpl Executor(State, std::move(ObjectFile), - Scratch.get()); - auto NewMeasurements = runMeasurements(Executor); + std::unique_ptr Executor = + createFunctionExecutor(Mode, State, std::move(ObjectFile), + Scratch.get(), Arg); + + auto NewMeasurements = runMeasurements(*Executor); if (Error E = NewMeasurements.takeError()) { if 
(!E.isA()) return std::move(E); diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h --- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h @@ -29,6 +29,17 @@ Expected> runMeasurements(const FunctionExecutor &Executor) const override; }; + +class LbrLatencyBenchmarkRunner : public BenchmarkRunner { +public: + LbrLatencyBenchmarkRunner(const LLVMState &State); + ~LbrLatencyBenchmarkRunner() = default; + +private: + Expected> + runMeasurements(const FunctionExecutor &Executor) const override; +}; + } // namespace exegesis } // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp --- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp @@ -52,5 +52,20 @@ return std::move(Result); } +LbrLatencyBenchmarkRunner::LbrLatencyBenchmarkRunner(const LLVMState &State) + : BenchmarkRunner(State, InstructionBenchmark::LbrLatency) {} + +llvm::Expected> +LbrLatencyBenchmarkRunner::runMeasurements( + const FunctionExecutor &Executor) const { + // TODO(vyng) Maybe verify that we're running with the right CPU/config + auto ExpectedCounterValue = Executor.runAndMeasure("LBR"); + if (!ExpectedCounterValue) + return ExpectedCounterValue.takeError(); + std::vector Result = { + BenchmarkMeasure::Create("lbr-latency", *ExpectedCounterValue)}; + return std::move(Result); +} + } // namespace exegesis } // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h @@ -17,6 +17,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" +#include "llvm/Support/Error.h" +#include #include #include @@ -36,7 +38,7 @@ public: 
// http://perfmon2.sourceforge.net/manv4/libpfm.html // Events are expressed as strings. e.g. "INSTRUCTION_RETIRED" - explicit PerfEvent(StringRef pfm_event_string); + explicit PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod = 0); PerfEvent(const PerfEvent &) = delete; PerfEvent(PerfEvent &&other); @@ -55,7 +57,14 @@ // e.g. "snb_ep::INSTRUCTION_RETIRED:e=0:i=0:c=0:t=0:u=1:k=0:mg=0:mh=1" StringRef getPfmEventString() const; + // Returns true if it should use LBR. + bool useLbr() const; + private: + void initPerfEvent(); + void initPerfEventForLbr(); + unsigned LbrSamplePeriod; + const std::string EventString; std::string FullQualifiedEventString; perf_event_attr *Attr; @@ -63,23 +72,32 @@ // Uses a valid PerfEvent to configure the Kernel so we can measure the // underlying event. -struct Counter { +class Counter { +public: // event: the PerfEvent to measure. explicit Counter(const PerfEvent &event); Counter(const Counter &) = delete; Counter(Counter &&other) = default; - ~Counter(); + virtual ~Counter(); - void start(); // Starts the measurement of the event. - void stop(); // Stops the measurement of the event. - int64_t read() const; // Return the current value of the counter. + /// Starts the measurement of the event. + virtual void start(); -private: + /// Stops the measurement of the event. + void stop(); + + /// Returns the current value of the counter. + virtual int64_t read() const; + + /// Returns the current value of the counter or error if it cannot be read. + virtual llvm::Expected readOrError() const { return read(); } + +protected: #ifdef HAVE_LIBPFM int FileDescriptor = -1; -#endif +#endif // HAVE_LIBPFM }; // Helper to measure a list of PerfEvent for a particular function. 
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp @@ -7,14 +7,24 @@ //===----------------------------------------------------------------------===// #include "PerfHelper.h" +#include "third_party/llvm/llvm-project/llvm/include/llvm/Support/Error.h" #include "llvm/Config/config.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/raw_ostream.h" +#include #ifdef HAVE_LIBPFM #include "perfmon/perf_event.h" #include "perfmon/pfmlib.h" #include "perfmon/pfmlib_perf_event.h" #endif +#include #include +#include +#include +#include +#include +#include namespace llvm { namespace exegesis { @@ -46,14 +56,14 @@ } PerfEvent::PerfEvent(PerfEvent &&Other) - : EventString(std::move(Other.EventString)), + : LbrSamplePeriod(Other.LbrSamplePeriod), + EventString(std::move(Other.EventString)), FullQualifiedEventString(std::move(Other.FullQualifiedEventString)), Attr(Other.Attr) { Other.Attr = nullptr; } -PerfEvent::PerfEvent(StringRef PfmEventString) - : EventString(PfmEventString.str()), Attr(nullptr) { +void PerfEvent::initPerfEvent() { #ifdef HAVE_LIBPFM char *Fstr = nullptr; pfm_perf_encode_arg_t Arg = {}; @@ -77,10 +87,40 @@ #endif } +void PerfEvent::initPerfEventForLbr() { + Attr = new perf_event_attr(); + *Attr = {0}; + Attr->size = sizeof(*Attr); + Attr->type = PERF_TYPE_HARDWARE; + Attr->config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS; + Attr->sample_type = PERF_SAMPLE_BRANCH_STACK; + Attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY; + Attr->sample_period = LbrSamplePeriod; + Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH. 
+ Attr->disabled = 1; + Attr->exclude_kernel = 1; + Attr->exclude_hv = 1; + Attr->read_format = PERF_FORMAT_GROUP; + + FullQualifiedEventString = "LBR"; +} + +PerfEvent::PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod) + : LbrSamplePeriod(SamplingPeriod), EventString(PfmEventString.str()), + Attr(nullptr) { + if (EventString == "LBR") { + initPerfEventForLbr(); + } else { + initPerfEvent(); + } +} + StringRef PerfEvent::name() const { return EventString; } bool PerfEvent::valid() const { return !FullQualifiedEventString.empty(); } +bool PerfEvent::useLbr() const { return EventString == "LBR"; } + const perf_event_attr *PerfEvent::attribute() const { return Attr; } StringRef PerfEvent::getPfmEventString() const { diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ b/llvm/tools/llvm-exegesis/lib/Target.h @@ -21,6 +21,7 @@ #include "Error.h" #include "LlvmState.h" #include "SnippetGenerator.h" +#include "third_party/llvm/llvm-project/llvm/include/llvm/Support/Error.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/CallingConv.h" @@ -65,6 +66,11 @@ explicit ExegesisTarget(ArrayRef CpuPfmCounters) : CpuPfmCounters(CpuPfmCounters) {} + virtual Expected + CreateCounter(const pfm::PerfEvent &Event) const { + return pfm::Counter(Event); + } + // Targets can use this to add target-specific passes in assembleToStream(); virtual void addTargetSpecificPasses(PassManagerBase &PM) const {} @@ -173,6 +179,8 @@ const LLVMState &State, InstructionBenchmark::ModeE Mode) const; std::unique_ptr virtual createUopsBenchmarkRunner( const LLVMState &State) const; + std::unique_ptr virtual createLbrLatencyBenchmarkRunner( + const LLVMState &State) const; const ExegesisTarget *Next = nullptr; const ArrayRef CpuPfmCounters; diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp --- 
a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -49,6 +49,8 @@ case InstructionBenchmark::Uops: case InstructionBenchmark::InverseThroughput: return createParallelSnippetGenerator(State, Opts); + case InstructionBenchmark::LbrLatency: + return createSerialSnippetGenerator(State, Opts); } return nullptr; } @@ -77,6 +79,8 @@ return make_error("can't run 'uops' mode, sched model does not " "define uops or issue counters."); return createUopsBenchmarkRunner(State); + case InstructionBenchmark::LbrLatency: + return createLbrLatencyBenchmarkRunner(State); } return nullptr; } @@ -101,6 +105,16 @@ return std::make_unique(State); } +std::unique_ptr +ExegesisTarget::createLbrLatencyBenchmarkRunner(const LLVMState &State) const { +#if defined(__x86_64__) || defined(__x86__) || defined(__i386__) + return std::make_unique(State); +#else +#warning "LBR benchmark runner being created for non-X86 arch." +#endif + return nullptr; +} + static_assert(std::is_pod::value, "We shouldn't have dynamic initialization here"); const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr, diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -18,7 +18,23 @@ #include "X86Subtarget.h" #include "llvm/ADT/Sequence.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" +#include +#include +#ifdef HAVE_LIBPFM +#include "perfmon/perf_event.h" +#include "perfmon/pfmlib.h" +#include "perfmon/pfmlib_perf_event.h" +#endif +#include +#include +#include +#include +#include +#include +#include namespace llvm { namespace exegesis { @@ -559,10 +575,189 @@ #include "X86GenExegesis.inc" namespace { +#ifdef HAVE_LIBPFM +class X86Counter : public pfm::Counter { +public: + explicit X86Counter(const pfm::PerfEvent 
&Event) : Counter(Event) { + if (Event.useLbr()) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page, so we allocate one more page. + MMappedBuffer = + mmap(nullptr, (kBufferPages + 1) * getpagesize(), + PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0); + if (MMappedBuffer == MAP_FAILED) { + llvm::errs() << "Failed to mmap buffer."; + } + LbrMode = true; + } + } + virtual ~X86Counter() {} + + void start() override { + if (LbrMode) { + ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */); + } else { + ioctl(FileDescriptor, PERF_EVENT_IOC_RESET, 0); + } + } + + int64_t read() const override { + auto error = readOrError(); + if (!error) { + return error.get(); + } + llvm::errs() << "Error reading counter: " << error.takeError() << "\n"; + return 0; + } + + llvm::Expected readOrError() const override { + // Parses the LBR buffer and fills CycleArray with the sequence of cycle + // counts from the buffer. + std::vector CycleArray; + std::unique_ptr DataBuf(new char[DataBufferSize]); + size_t Pos = 0; + for (;;) { + if (pollLbrPerfEvent() == -1) { + return llvm::make_error( + "Cannot poll LBR perf event.", llvm::errc::io_error); + } + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page. + struct perf_event_mmap_page Page; + memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page)); + const uint64_t DataTail = Page.data_tail; + const uint64_t DataHead = Page.data_head; + // We're supposed to use a barrier after reading data_head. 
+ std::atomic_thread_fence(std::memory_order_acq_rel); + const size_t DataSize = DataHead - DataTail; + if (DataSize > DataBufferSize) { + return llvm::make_error( + "DataSize larger than buffer size.", llvm::errc::invalid_argument); + } + copyDataBuffer(DataBuf.get(), DataTail, DataSize); + llvm::Error error = + readDataBuffer(DataBuf.get(), DataSize, &CycleArray, Pos); + if (!error) { + // TODO(vyng) Analyse the array and get proper value. + return CycleArray[0]; + } + } + return llvm::make_error("Unknown error.", + llvm::errc::io_error); + } + +private: + // Waits for the LBR perf events. + int pollLbrPerfEvent() const { + struct pollfd PollFd; + PollFd.fd = FileDescriptor; + PollFd.events = POLLIN; + PollFd.revents = 0; + return poll(&PollFd, 1 /* num of fds */, 10000 /* time out */); + } + + void copyDataBuffer(char *Buf, uint64_t Tail, size_t DataSize) const { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page. + char *Start = reinterpret_cast(MMappedBuffer) + getpagesize(); + // The LBR buffer is a cyclic buffer, we copy data to another buffer. + uint64_t Offset = Tail % DataBufferSize; + size_t CopySize = DataBufferSize - Offset; + memcpy(Buf, Start + Offset, CopySize); + if (CopySize >= DataSize) + return; + memcpy(Buf + CopySize, Start, Offset); + } + + llvm::Error readDataBuffer(const char *DataBuf, size_t DataSize, + std::vector *CycleArray, + size_t Pos) const { + const char *DataPtr = DataBuf; + while (DataPtr < DataBuf + DataSize) { + struct perf_event_header Header; + memcpy(&Header, DataPtr, sizeof(struct perf_event_header)); + if (Header.type != PERF_RECORD_SAMPLE) { + // Ignores non-sample records. + DataPtr += Header.size; + continue; + } + DataPtr += sizeof(Header); + uint64_t Count = llvm::support::endian::read64(DataPtr, support::native); + DataPtr += sizeof(Count); + + struct perf_branch_entry Entry; + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + // Read the perf_branch_entry array. 
+ char *JumpPC = reinterpret_cast(Entry.from); + int64_t MinCycle = Entry.cycles; + for (int i = 0; i < Count; ++i) { + // We use the JumpPC from the entry with min cycle to avoid the entry + // that is returning from kernel. + if (MinCycle > Entry.cycles) { + MinCycle = Entry.cycles; + JumpPC = reinterpret_cast(Entry.from); + } + (*CycleArray)[Pos++] = Entry.cycles; + if (Pos == CycleArray->size()) { + patchBasicBlockToEndBenchmarkedLoop(JumpPC); + return llvm::Error::success(); + } + // Advance to next entry + DataPtr += sizeof(Entry); + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + } + } + return llvm::make_error("Unable to read databuffer.", + llvm::errc::io_error); + } + + void patchBasicBlockToEndBenchmarkedLoop(char *pc) const { + auto prot = PROT_READ | PROT_WRITE | PROT_EXEC; + auto page = reinterpret_cast(reinterpret_cast(pc) & + ~(getpagesize() - 1)); + mprotect(page, getpagesize(), prot); + // Update the last jump back instruction, pc is the start of jump + // instruction. + if ((reinterpret_cast(pc) & 0xf) != 0xf) { + // We have at least two bytes in the same cacheline, so we can use an + // atomic write to replace the jmp with pop, ret. + *reinterpret_cast(pc) = 0xc35b; // pop %rbx; ret; + } else { + // pc and pc+1 might be in different cachelines, so we first add the pop, + // ret instructions, and then update jump offset to jump to pop + // instruction. + *reinterpret_cast(pc + 5) = 0xc35b; // pop %rbx; ret; + // An atomic update on the displacement. 
+ if (pc[0] == 0xeb /* x86 short jump */) { + // offset is 1 byte + pc[1] = 0x03; + } else { /* x86 near jump */ + // offset is 4 bytes + *reinterpret_cast(pc + 1) = 0; + } + } + } + + bool LbrMode = false; + void *MMappedBuffer = nullptr; + size_t DataBufferSize; + static constexpr int kBufferPages = 8; +}; +#endif + class ExegesisX86Target : public ExegesisTarget { public: ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {} + Expected + CreateCounter(const pfm::PerfEvent &Event) const override { +#ifdef HAVE_LIBPFM + return X86Counter(Event); +#else + return pfm::Counter(Event); +#endif + } + private: void addTargetSpecificPasses(PassManagerBase &PM) const override; diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -73,6 +73,8 @@ "mode", cl::desc("the mode to run"), cl::cat(Options), cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency, "latency", "Instruction Latency"), + clEnumValN(exegesis::InstructionBenchmark::LbrLatency, + "lbr_latency", "Instruction Latency using LBR"), clEnumValN(exegesis::InstructionBenchmark::InverseThroughput, "inverse_throughput", "Instruction Inverse Throughput"), @@ -83,6 +85,11 @@ clEnumValN(exegesis::InstructionBenchmark::Unknown, "analysis", "Analysis"))); +static cl::opt + LbrSamplePeriod("lbr-sample-period", + cl::desc("The sample period in msec used for LBR sampling"), + cl::cat(BenchmarkOptions), cl::init(10000)); + static cl::opt RepetitionMode( "repetition-mode", cl::desc("how to repeat the instruction snippet"), cl::cat(BenchmarkOptions), @@ -332,6 +339,13 @@ Configurations = ExitOnErr(readSnippets(State, SnippetsFile)); } +#if !defined(__x86_64__) && !defined(__x86__) && !defined(__i386__) + if (exegesis::BenchmarkMode == exegesis::InstructionBenchmark::LbrLatency) { + ExitOnErr.setBanner("llvm-exegesis: "); + ExitWithError("LBR mode must be run on x86 
arch."); + } +#endif + if (NumRepetitions == 0) { ExitOnErr.setBanner("llvm-exegesis: "); ExitWithError("--num-repetitions must be greater than zero"); @@ -342,8 +356,10 @@ BenchmarkFile = "-"; for (const BenchmarkCode &Conf : Configurations) { - InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration( - Conf, NumRepetitions, Repetitors, DumpObjectToDisk)); + BenchmarkRunner::RunArg RunArg{Conf, NumRepetitions, Repetitors, + DumpObjectToDisk, LbrSamplePeriod}; + + InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration(RunArg)); ExitOnFileError(BenchmarkFile, Result.writeYaml(State, BenchmarkFile)); } exegesis::pfm::pfmTerminate();