diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -192,10 +192,24 @@
 
 .. option:: -mode=[latency|uops|inverse_throughput|analysis]
 
- Specify the run mode. Note that if you pick `analysis` mode, you also need
- to specify at least one of the `-analysis-clusters-output-file=` and
- `-analysis-inconsistencies-output-file=`.
+ Specify the run mode. Note that some modes have additional requirements and options.
 
+ `latency` mode can make use of either RDTSC or LBR.
+ `latency[LBR]` is only available on X86 (at least `Skylake`).
+ To run in this mode, a positive value must be specified for `-x86-lbr-sample-period=`.
+
+ In `analysis` mode, you also need to specify at least one of the
+ `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`.
+
+.. option:: -x86-lbr-sample-period=<nBranches/sample>
+
+  Specify the LBR sampling period, i.e. how many branches are executed before
+  a sample is taken. When a positive value is specified for this option and the
+  mode is `latency`, LBRs are used for measuring.
+  When choosing the sampling period, prefer a small value, but note that
+  throttling could occur if the sampling is too frequent. A prime number should
+  be used to avoid consistently skipping certain blocks.
+
 .. option:: -repetition-mode=[duplicate|loop|min]
 
  Specify the repetition mode. `duplicate` will create a large, straight line
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -20,6 +20,7 @@
 #include "BenchmarkResult.h"
 #include "LlvmState.h"
 #include "MCInstrDescView.h"
+#include "PerfHelper.h"
 #include "SnippetRepetitor.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Error.h"
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -7,18 +7,24 @@
 //===----------------------------------------------------------------------===//
 
 #include <array>
+#include <memory>
+#include <mutex>
 #include <string>
+#include <thread>
 
 #include "Assembler.h"
+#include "BenchmarkResult.h"
 #include "BenchmarkRunner.h"
 #include "Error.h"
 #include "MCInstrDescView.h"
 #include "PerfHelper.h"
+#include "Target.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/CrashRecoveryContext.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
@@ -78,6 +84,112 @@
   const ExecutableFunction Function;
   BenchmarkRunner::ScratchSpace *const Scratch;
 };
+
+class WorkerThreadFunctionExecutorImpl
+    : public BenchmarkRunner::FunctionExecutor {
+public:
+  WorkerThreadFunctionExecutorImpl(
+      const LLVMState &State,
+      llvm::object::OwningBinary<llvm::object::ObjectFile> Obj,
+      BenchmarkRunner::ScratchSpace *Scratch)
+      : State(State), Function(State.createTargetMachine(), std::move(Obj)),
+        Scratch(Scratch) {}
+
+  // State shared with the worker thread; writes after start-up hold Mutex.
+  struct ThreadArg {
+    std::mutex Mutex;
+    std::unique_ptr<pfm::Counter> CounterPtr;
+    std::condition_variable ConditionVariable;
+    bool Crashed = false;
+    std::string ErrMsg;
+    const char *CounterName = nullptr;
+  };
+
+  // Creates the counter, publishes it through Arg, then runs the snippet.
+  // The counter is left in Arg so that the caller can still read it.
+  void workerThread(struct ThreadArg *Arg) const {
+    auto CounterOrError =
+        State.getExegesisTarget().createCounter(Arg->CounterName, State);
+    if (!CounterOrError) {
+      // toString() consumes the Error, which must not be dropped unchecked.
+      const std::string Reason = toString(CounterOrError.takeError());
+      llvm::errs() << "Error creating counter: " << Reason;
+      std::lock_guard<std::mutex> Lock(Arg->Mutex);
+      Arg->Crashed = true;
+      Arg->ErrMsg = "Cannot create counter.";
+      Arg->ConditionVariable.notify_one();
+      return;
+    }
+    {
+      std::lock_guard<std::mutex> Lock(Arg->Mutex);
+      // Transfer ownership; reset(get()) would leave two owners.
+      Arg->CounterPtr = std::move(*CounterOrError);
+    }
+    Arg->ConditionVariable.notify_one();
+    llvm::CrashRecoveryContext CRC;
+    llvm::CrashRecoveryContext::Enable();
+    const bool Crashed = !CRC.RunSafely([&]() {
+      Arg->CounterPtr->start();
+      Function(Scratch->ptr());
+      Arg->CounterPtr->stop();
+    });
+    llvm::CrashRecoveryContext::Disable();
+    // FIXME: Better diagnosis.
+    std::lock_guard<std::mutex> Lock(Arg->Mutex);
+    Arg->Crashed = Crashed;
+  }
+
+  // Runs the snippet on a worker thread and reads CounterName from this one.
+  llvm::Expected<int64_t>
+  runAndMeasure(const char *CounterName) const override {
+    Scratch->clear();
+    struct ThreadArg Arg;
+    Arg.CounterName = CounterName;
+    std::thread Worker(&WorkerThreadFunctionExecutorImpl::workerThread, this,
+                       &Arg);
+    bool Failed = false;
+    {
+      // Waits until the counter is ready or a "Crashed" signal if
+      // it could not be created.
+      std::unique_lock<std::mutex> Lock(Arg.Mutex);
+      Arg.ConditionVariable.wait(
+          Lock, [&Arg] { return Arg.Crashed || Arg.CounterPtr != nullptr; });
+      Failed = Arg.Crashed;
+    }
+    if (Failed) {
+      // A joinable std::thread must be joined before it is destroyed.
+      Worker.join();
+      return make_error<SnippetCrash>(
+          "Snippet crashed while running in thread. Reason: [" + Arg.ErrMsg +
+          "]");
+    }
+
+    auto ValueOrError = Arg.CounterPtr->readOrError();
+    Worker.join();
+    if (!ValueOrError)
+      return ValueOrError.takeError();
+    return ValueOrError.get();
+  }
+
+  const LLVMState &State;
+  const ExecutableFunction Function;
+  BenchmarkRunner::ScratchSpace *const Scratch;
+};
+
+// Returns a worker-thread executor when the target requests one for Mode.
+std::unique_ptr<BenchmarkRunner::FunctionExecutor>
+createFunctionExecutor(InstructionBenchmark::ModeE Mode, const LLVMState &State,
+                       object::OwningBinary<object::ObjectFile> ObjectFile,
+                       BenchmarkRunner::ScratchSpace *Scratch) {
+  const bool UseWorkerThread =
+      State.getExegesisTarget().useWorkerThreadForBenchmark(State, Mode);
+  if (!UseWorkerThread)
+    return std::make_unique<FunctionExecutorImpl>(State, std::move(ObjectFile),
+                                                  Scratch);
+  return std::make_unique<WorkerThreadFunctionExecutorImpl>(
+      State, std::move(ObjectFile), Scratch);
+}
+
 } // namespace
 
 Expected<InstructionBenchmark> BenchmarkRunner::runConfiguration(
@@ -127,6 +239,7 @@
               OS)) {
         return std::move(E);
       }
+
       const ExecutableFunction EF(State.createTargetMachine(),
                                   getObjectFromBuffer(OS.str()));
       const auto FnBytes = EF.getFunctionBytes();
@@ -161,9 +274,11 @@
       ObjectFile = getObjectFromBuffer(OS.str());
     }
 
-    const FunctionExecutorImpl Executor(State, std::move(ObjectFile),
-                                        Scratch.get());
-    auto NewMeasurements = runMeasurements(Executor);
+    std::unique_ptr<BenchmarkRunner::FunctionExecutor> Executor =
+        createFunctionExecutor(Mode, State, std::move(ObjectFile),
+                               Scratch.get());
+
+    auto NewMeasurements = runMeasurements(*Executor);
     if (Error E = NewMeasurements.takeError()) {
       if (!E.isA<SnippetCrash>())
         return std::move(E);
diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
--- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
@@ -25,10 +25,24 @@
                          InstructionBenchmark::ModeE Mode);
   ~LatencyBenchmarkRunner() override;
 
+private:
+  Expected<std::vector<BenchmarkMeasure>>
+  runMeasurementsWithLbr(const FunctionExecutor &Executor) const;
+
+  Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const override;
+};
+
+class LbrLatencyBenchmarkRunner : public BenchmarkRunner {
+public:
+  explicit LbrLatencyBenchmarkRunner(const LLVMState &State);
+  ~LbrLatencyBenchmarkRunner() override = default;
+
 private:
   Expected<std::vector<BenchmarkMeasure>>
   runMeasurements(const FunctionExecutor &Executor) const override;
 };
+
 } // namespace exegesis
 } // namespace llvm
 
diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
--- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
@@ -41,7 +41,7 @@
   std::vector<BenchmarkMeasure> Result;
   switch (Mode) {
   case InstructionBenchmark::Latency:
-    Result = {BenchmarkMeasure::Create("latency", MinValue)};
+    Result = {BenchmarkMeasure::Create("latency", MinValue)};
     break;
   case InstructionBenchmark::InverseThroughput:
     Result = {BenchmarkMeasure::Create("inverse_throughput", MinValue)};
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h
--- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h
+++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h
@@ -17,6 +17,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
 #include <functional>
 #include <memory>
 
@@ -36,7 +38,7 @@
 public:
   // http://perfmon2.sourceforge.net/manv4/libpfm.html
   // Events are expressed as strings. e.g. "INSTRUCTION_RETIRED"
-  explicit PerfEvent(StringRef pfm_event_string);
+  explicit PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod = 0);
 
   PerfEvent(const PerfEvent &) = delete;
   PerfEvent(PerfEvent &&other);
@@ -56,6 +58,9 @@
   StringRef getPfmEventString() const;
 
 private:
+  void initPerfEvent();
+  void initPerfEventForLbr(unsigned SamplingPeriod);
+
   const std::string EventString;
   std::string FullQualifiedEventString;
   perf_event_attr *Attr;
@@ -63,24 +68,35 @@
 
 // Uses a valid PerfEvent to configure the Kernel so we can measure the
 // underlying event.
-struct Counter {
+class Counter {
+public:
   // event: the PerfEvent to measure.
   explicit Counter(PerfEvent &&event);
 
   Counter(const Counter &) = delete;
   Counter(Counter &&other) = default;
 
-  ~Counter();
+  virtual ~Counter();
 
-  void start();         // Starts the measurement of the event.
-  void stop();          // Stops the measurement of the event.
-  int64_t read() const; // Return the current value of the counter.
+  /// Starts the measurement of the event.
+  virtual void start();
 
-private:
-  PerfEvent Event;
+  /// Stops the measurement of the event.
+  void stop();
+
+  /// Returns the current value of the counter.
+  virtual int64_t read() const;
+
+  /// Returns the current value of the counter or error if it cannot be read.
+  virtual llvm::Expected<int64_t> readOrError() const { return read(); }
+
+protected:
 #ifdef HAVE_LIBPFM
   int FileDescriptor = -1;
-#endif
+#endif // HAVE_LIBPFM
+
+private:
+  PerfEvent Event;
 };
 
 } // namespace pfm
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
--- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
+++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
@@ -8,13 +8,23 @@
 
 #include "PerfHelper.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
+#include <memory>
 #ifdef HAVE_LIBPFM
 #include "perfmon/perf_event.h"
 #include "perfmon/pfmlib.h"
 #include "perfmon/pfmlib_perf_event.h"
 #endif
+#include <atomic>
 #include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 namespace llvm {
 namespace exegesis {
@@ -52,8 +62,7 @@
   Other.Attr = nullptr;
 }
 
-PerfEvent::PerfEvent(StringRef PfmEventString)
-    : EventString(PfmEventString.str()), Attr(nullptr) {
+void PerfEvent::initPerfEvent() {
 #ifdef HAVE_LIBPFM
   char *Fstr = nullptr;
   pfm_perf_encode_arg_t Arg = {};
@@ -77,6 +86,36 @@
 #endif
 }
 
+void PerfEvent::initPerfEventForLbr(unsigned SamplingPeriod) {
+#ifdef HAVE_LIBPFM
+  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
+  // Value-initialization zeroes every field we do not set explicitly below.
+  Attr = new perf_event_attr();
+  Attr->type = PERF_TYPE_HARDWARE;
+  Attr->size = sizeof(*Attr);
+  Attr->config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
+  Attr->sample_period = SamplingPeriod;
+  Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
+  Attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY;
+  Attr->read_format = PERF_FORMAT_GROUP;
+  Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
+  Attr->disabled = 1;
+  Attr->exclude_kernel = 1;
+  Attr->exclude_hv = 1;
+
+  FullQualifiedEventString = "LBR";
+#endif
+}
+
+// The pseudo event name "LBR" selects the branch-stack sampling setup.
+PerfEvent::PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod)
+    : EventString(PfmEventString.str()), Attr(nullptr) {
+  if (EventString != "LBR")
+    initPerfEvent();
+  else
+    initPerfEventForLbr(SamplingPeriod);
+}
+
 StringRef PerfEvent::name() const { return EventString; }
 
 bool PerfEvent::valid() const { return !FullQualifiedEventString.empty(); }
diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h
--- a/llvm/tools/llvm-exegesis/lib/Target.h
+++ b/llvm/tools/llvm-exegesis/lib/Target.h
@@ -27,6 +27,7 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Error.h"
 
 namespace llvm {
 namespace exegesis {
@@ -65,6 +66,16 @@
   explicit ExegesisTarget(ArrayRef<CpuAndPfmCounters> CpuPfmCounters)
       : CpuPfmCounters(CpuPfmCounters) {}
 
+  /// Whether the benchmark should run on a worker thread (defaults to false).
+  virtual bool useWorkerThreadForBenchmark(const LLVMState &,
+                                           InstructionBenchmark::ModeE) const {
+    return false;
+  }
+
+  /// Creates the counter for CounterName; virtual so targets can override it.
+  virtual Expected<std::unique_ptr<pfm::Counter>>
+  createCounter(const char *CounterName, const LLVMState &State) const;
+
   // Targets can use this to add target-specific passes in assembleToStream();
   virtual void addTargetSpecificPasses(PassManagerBase &PM) const {}
 
diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp
--- a/llvm/tools/llvm-exegesis/lib/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Target.cpp
@@ -27,6 +27,12 @@
   return nullptr;
 }
 
+Expected<std::unique_ptr<pfm::Counter>>
+ExegesisTarget::createCounter(const char *CounterName,
+                              const LLVMState &) const {
+  return std::make_unique<pfm::Counter>(pfm::PerfEvent(CounterName));
+}
+
 void ExegesisTarget::registerTarget(ExegesisTarget *Target) {
   if (FirstTarget == nullptr) {
     FirstTarget = Target;
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -14,15 +14,37 @@
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "X86.h"
+#include "X86Counter.h"
 #include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/MC/MCInstBuilder.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 
+#include <memory>
+#include <string>
+#include <vector>
+
 namespace llvm {
 namespace exegesis {
 
+static cl::OptionCategory BenchmarkOptions("llvm-exegesis benchmark options");
+
+// If a positive value is specified, we are going to use the LBR in
+// latency-mode.
+//
+// Note:
+//  -  A small value is preferred, but too low a value could result in
+//     throttling.
+//  -  A prime number is preferred to avoid always skipping certain blocks.
+//
+static cl::opt<unsigned> LbrSamplingPeriod(
+    "x86-lbr-sample-period",
+    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
+    cl::cat(BenchmarkOptions), cl::init(0));
+
 // Returns a non-null reason if we cannot handle the memory references in this
 // instruction.
 static const char *isInvalidMemoryInstr(const Instruction &Instr) {
@@ -559,10 +581,53 @@
 #include "X86GenExegesis.inc"
 
 namespace {
+
+// Returns true if the target CPU is on the hard-coded LBR allow-list.
+bool supportsLbr(const LLVMState &State) {
+  // FIXME: check the first 5 bits of IA32_PERF_CAPABILITIES rather than
+  // hard-coding the names. (Intel SDM, vol 3B 17.4.8.1)
+  // Specifically, "MSR IA32_PERF_CAPABILITIES[5:0]" == 000110B
+  const StringRef Cpu = State.getTargetMachine().getTargetCPU();
+  return Cpu == "skylake" || Cpu == "kabylake";
+}
+
 class ExegesisX86Target : public ExegesisTarget {
 public:
   ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
 
+  // Uses a worker thread only for LBR latency runs on a CPU that supports LBR.
+  bool
+  useWorkerThreadForBenchmark(const LLVMState &State,
+                              InstructionBenchmark::ModeE Mode) const override {
+    const bool WantsLbr =
+        Mode == InstructionBenchmark::ModeE::Latency && LbrSamplingPeriod > 0;
+    if (!WantsLbr)
+      return false;
+    if (supportsLbr(State))
+      return true;
+    // Falls back to the default option.
+    llvm::errs() << "LBR not supported on given target["
+                 << State.getTargetMachine().getTargetCPU().str() << "].\n";
+    return false;
+  }
+
+  Expected<std::unique_ptr<pfm::Counter>>
+  createCounter(const char *CounterName,
+                const LLVMState &State) const override {
+    if (CounterName == State.getPfmCounters().CycleCounter) {
+      // Can't use LBR without HAVE_LIBPFM, or __linux__ (for now)
+#if defined(HAVE_LIBPFM) && defined(__linux__)
+      return std::make_unique<X86LbrCounter>(
+          pfm::PerfEvent("LBR", LbrSamplingPeriod));
+#else
+      return llvm::make_error<llvm::StringError>(
+          "LBR counter requested without HAVE_LIBPFM or running on Linux.",
+          llvm::errc::invalid_argument);
+#endif
+    }
+    return ExegesisTarget::createCounter(CounterName, State);
+  }
+
 private:
   void addTargetSpecificPasses(PassManagerBase &PM) const override;
 
diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h
@@ -0,0 +1,41 @@
+//===-- X86Counter.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H
+#define LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H
+
+#include "../PerfHelper.h"
+#include "llvm/Support/Error.h"
+
+// FIXME: Use appropriate wrappers for poll.h and mman.h
+// to support Windows and remove this linux-only guard.
+#if defined(__linux__) && defined(HAVE_LIBPFM)
+
+namespace llvm {
+namespace exegesis {
+
+/// pfm::Counter specialization that reads LBR samples from an mmap'ed buffer.
+class X86LbrCounter : public pfm::Counter {
+public:
+  explicit X86LbrCounter(const pfm::PerfEvent &Event);
+  ~X86LbrCounter() override;
+
+  void start() override;
+  int64_t read() const override;
+  llvm::Expected<int64_t> readOrError() const override;
+
+private:
+  void *MMappedBuffer = nullptr;
+};
+
+} // namespace exegesis
+} // namespace llvm
+
+#endif // defined(__linux__) && defined(HAVE_LIBPFM)
+
+#endif // LLVM_TOOLS_LLVM_EXEGESIS_X86COUNTER_H
diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp
@@ -0,0 +1,225 @@
+//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Counter.h"
+
+// FIXME: Use appropriate wrappers for poll.h and mman.h
+// to support Windows and remove this linux-only guard.
+#ifdef __linux__
+#include "llvm/Support/Errc.h"
+
+#ifdef HAVE_LIBPFM
+#include "perfmon/perf_event.h"
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif // HAVE_LIBPFM
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <utility>
+#include <vector>
+
+#ifdef HAVE_LIBPFM
+namespace llvm {
+namespace exegesis {
+
+static constexpr int kBufferPages = 8;
+static const int kDataBufferSize = kBufferPages * getpagesize();
+
+// Polls FileDescriptor for POLLIN (10s timeout) and returns poll()'s result.
+static int pollLbrPerfEvent(const int FileDescriptor) {
+  struct pollfd PollFd;
+  PollFd.fd = FileDescriptor;
+  PollFd.events = POLLIN;
+  PollFd.revents = 0;
+  return poll(&PollFd, 1 /* num of fds */, 10000 /* timeout in ms */);
+}
+
+// Unrolls the cyclic data area behind MMappedBuffer into Buf, starting at Tail.
+static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
+                           size_t DataSize) {
+  // First page is reserved for perf_event_mmap_page. Data buffer starts on
+  // the next page.
+  char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize();
+  // Copy up to the end of the ring first, then the wrapped-around remainder.
+  uint64_t Offset = Tail % kDataBufferSize;
+  size_t CopySize = kDataBufferSize - Offset;
+  memcpy(Buf, Start + Offset, CopySize);
+  if (CopySize >= DataSize)
+    return;
+  memcpy(Buf + CopySize, Start, Offset);
+}
+
+// FIXME: Remove this hack.
+// Option #1: Make the assembler produce code with a condition var for stopping
+// the loop, which can be found statically (at fixed addr) and remove this
+// hack of modifying the code while running.
+// Option #2: Run benchmarked code in the same thread, and make use of
+// Freeze_LBRs_On_PMI
+//
+// Rewrites the jmp (from the BM loop) with `pop $rbx; ret` to get
+// out of the benchmarked code.
+//
+// pc: Must point at the start of the jmp instruction.
+static void patchBasicBlockToEndBenchmarkedLoop(char *pc) {
+  void *page = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(pc) &
+                                        ~(getpagesize() - 1));
+  mprotect(page, getpagesize(), PROT_READ | PROT_WRITE | PROT_EXEC);
+  if ((reinterpret_cast<uint64_t>(pc) & 0xf) != 0xf) {
+    // We have at least two bytes in the same cacheline, so we can use an
+    // atomic write to replace the jmp with pop, ret.
+    *reinterpret_cast<uint16_t *>(pc) = 0xc35b; // pop %rbx; ret;
+  } else {
+    // pc and pc+1 might be in different cachelines, so we first write the pop,
+    // ret instructions and then update the jump offset to land on the pop.
+    *reinterpret_cast<uint16_t *>(pc + 5) = 0xc35b; // pop %rbx; ret;
+    // An atomic update on the displacement. The opcode is compared as an
+    // unsigned byte: a plain (signed) char can never equal 0xeb.
+    if (static_cast<uint8_t>(pc[0]) == 0xeb /* x86 short jump */) {
+      // offset is 1 byte
+      pc[1] = 0x03;
+    } else { /* x86 near jump */
+      // offset is 4 bytes
+      *reinterpret_cast<uint32_t *>(pc + 1) = 0;
+    }
+  }
+}
+
+// Parses the perf data buffer and appends each branch's cycle count to
+// CycleArray; on success also patches the code to leave the benchmark loop.
+static llvm::Error parseDataBuffer(const char *const DataBuf, size_t DataSize,
+                                   std::vector<int64_t> *CycleArray) {
+  const char *const DataEnd = DataBuf + DataSize;
+  const char *DataPtr = DataBuf;
+  while (DataPtr < DataEnd) {
+    struct perf_event_header Header;
+    memcpy(&Header, DataPtr, sizeof(struct perf_event_header));
+    if (Header.size == 0)
+      break; // Malformed record: advancing by zero would never terminate.
+    if (Header.type != PERF_RECORD_SAMPLE) {
+      // Ignores non-sample records.
+      DataPtr += Header.size;
+      continue;
+    }
+    DataPtr += sizeof(Header);
+    uint64_t Count = 0;
+    memcpy(&Count, DataPtr, sizeof(Count)); // Native-endian entry count.
+    DataPtr += sizeof(Count);
+    if (Count == 0)
+      continue; // Empty branch stack: nothing to report for this sample.
+    struct perf_branch_entry Entry;
+    if (DataPtr > DataEnd ||
+        Count > static_cast<uint64_t>(DataEnd - DataPtr) / sizeof(Entry))
+      break; // Truncated sample: the entries do not fit in the data read.
+    // We use the JumpPC from the entry with min cycle to avoid the entry
+    // that is returning from kernel.
+    char *JumpPC = nullptr;
+    int64_t MinCycle = 0;
+    for (uint64_t I = 0; I < Count; ++I, DataPtr += sizeof(Entry)) {
+      memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
+      if (I == 0 || MinCycle > Entry.cycles) {
+        MinCycle = Entry.cycles;
+        JumpPC = reinterpret_cast<char *>(Entry.from);
+      }
+      CycleArray->push_back(Entry.cycles);
+    }
+    patchBasicBlockToEndBenchmarkedLoop(JumpPC);
+    return llvm::Error::success();
+  }
+  return llvm::make_error<llvm::StringError>("Unable to read databuffer.",
+                                             llvm::errc::io_error);
+}
+
+X86LbrCounter::X86LbrCounter(const pfm::PerfEvent &Event) : Counter(Event) {
+  // The first page holds perf_event_mmap_page, hence one page more than data.
+  MMappedBuffer = mmap(nullptr, (kBufferPages + 1) * getpagesize(),
+                       PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0);
+  if (MMappedBuffer == MAP_FAILED)
+    llvm::errs() << "Failed to mmap buffer.";
+}
+X86LbrCounter::~X86LbrCounter() {
+  munmap(MMappedBuffer, (kBufferPages + 1) * getpagesize());
+  close(FileDescriptor);
+}
+
+void X86LbrCounter::start() {
+  ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
+}
+
+int64_t X86LbrCounter::read() const {
+  auto ValueOrError = readOrError();
+  if (ValueOrError) // Expected<T> is true when it holds a value.
+    return *ValueOrError;
+  llvm::errs() << "Error reading counter: "
+               << toString(ValueOrError.takeError()) << "\n";
+  return 0;
+}
+
+llvm::Expected<int64_t> X86LbrCounter::readOrError() const {
+  // The max number of time-outs/retries before we give up.
+  static constexpr int kMaxTimeouts = 160;
+
+  // Parses the LBR buffer and fills CycleArray with the sequence of cycle
+  // counts from the buffer.
+  std::vector<int64_t> CycleArray;
+  std::unique_ptr<char[]> DataBuf(new char[kDataBufferSize]);
+  int NumTimeouts = 0;
+  int PollResult = 0;
+  while (PollResult <= 0) {
+    PollResult = pollLbrPerfEvent(FileDescriptor);
+    if (PollResult == -1) {
+      return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
+                                                 llvm::errc::io_error);
+    } else if (PollResult == 0) {
+      llvm::errs() << "LBR polling timed out without result, NumTimeouts ="
+                   << NumTimeouts << ". ";
+      if (NumTimeouts < kMaxTimeouts) {
+        llvm::errs() << "Retrying ...\n";
+        ++NumTimeouts;
+        continue;
+      } else {
+        llvm::errs() << "At max-timeouts. Giving up.\n";
+        return llvm::make_error<llvm::StringError>(
+            "LBR polling still timed out after max number of attempts.",
+            llvm::errc::device_or_resource_busy);
+      }
+    }
+  }
+
+  struct perf_event_mmap_page Page;
+  memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));
+  const uint64_t DataTail = Page.data_tail;
+  const uint64_t DataHead = Page.data_head;
+  // We're supposed to use a barrier after reading data_head.
+  std::atomic_thread_fence(std::memory_order_acq_rel);
+  const size_t DataSize = DataHead - DataTail;
+  if (DataSize > kDataBufferSize) {
+    return llvm::make_error<llvm::StringError>(
+        "DataSize larger than buffer size.", llvm::errc::invalid_argument);
+  }
+  copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
+  llvm::Error error = parseDataBuffer(DataBuf.get(), DataSize, &CycleArray);
+  if (!error) {
+    // FIXME We should report the cycles count for all jumps,
+    // not just the most recent.
+    return CycleArray[0];
+  }
+
+  return std::move(error); // llvm::Error is move-only.
+}
+
+} // namespace exegesis
+} // namespace llvm
+
+#endif // HAVE_LIBPFM
+#endif // __linux__