diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -192,10 +192,24 @@
 
 .. option:: -mode=[latency|uops|inverse_throughput|analysis]
 
- Specify the run mode. Note that if you pick `analysis` mode, you also need
- to specify at least one of the `-analysis-clusters-output-file=` and
- `-analysis-inconsistencies-output-file=`.
+ Specify the run mode. Note that some modes have additional requirements and options.
 
+ `latency` mode can make use of either RDTSC or LBR.
+   `latency[LBR]` is only available on X86 (at least `Haswell`, but preferably
+   `Skylake` for more precise measurements). To run in this mode, a positive value
+   must be specified for `-x86-lbr-sample-period`.
+
+ In `analysis` mode, you also need to specify at least one of the
+ `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`.
+
+.. option:: -x86-lbr-sample-period=<nBranches/sample>
+
+  Specify the sampling period - how many branches before we take a sample.
+  When choosing the sampling period, a small value is preferred, but throttling
+  could occur if the sampling is too frequent. A prime number should be used to
+  avoid hiding blocks.
+  Based on experiments so far, `521` appears to be the best value.
+
 .. option:: -repetition-mode=[duplicate|loop|min]
 
  Specify the repetition mode. `duplicate` will create a large, straight line
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -20,6 +20,7 @@
 #include "BenchmarkResult.h"
 #include "LlvmState.h"
 #include "MCInstrDescView.h"
+#include "PerfHelper.h"
 #include "SnippetRepetitor.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Error.h"
@@ -33,16 +34,27 @@
 // Common code for all benchmark modes.
 class BenchmarkRunner {
 public:
+  // Bundles the arguments of runConfiguration(const RunArg &).
+  struct RunArg {
+    const BenchmarkCode &CodeSnippet; // Snippet to benchmark (not owned).
+    unsigned NumRepetitions; // Stored in the result, passed to each repetitor.
+    ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors;
+    bool DumpObjectToDisk;    // Write the object file to disk before running.
+    unsigned LbrSamplePeriod; // > 0 requests the LBR executor in latency mode.
+  };
   explicit BenchmarkRunner(const LLVMState &State,
                            InstructionBenchmark::ModeE Mode);
 
   virtual ~BenchmarkRunner();
 
+  // TODO(vyng) Make this "Deprecated" and switch caller to using RunArg
   Expected<InstructionBenchmark>
   runConfiguration(const BenchmarkCode &Configuration, unsigned NumRepetitions,
                    ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
                    bool DumpObjectToDisk) const;
 
+  Expected<InstructionBenchmark> runConfiguration(const RunArg &Arg) const;
+
   // Scratch space to run instructions that touch memory.
   struct ScratchSpace {
     static constexpr const size_t kAlignment = 1024;
@@ -66,6 +78,8 @@
   public:
     virtual ~FunctionExecutor();
     virtual Expected<int64_t> runAndMeasure(const char *Counters) const = 0;
+    // Backend tag embedded in measurement keys (e.g. "LBR"); may be empty.
+    virtual std::string description() const = 0;
   };
 
 protected:
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -7,18 +7,24 @@
 //===----------------------------------------------------------------------===//
 
 #include <array>
+#include <memory>
+#include <mutex>
 #include <string>
+#include <thread>
 
 #include "Assembler.h"
+#include "BenchmarkResult.h"
 #include "BenchmarkRunner.h"
 #include "Error.h"
 #include "MCInstrDescView.h"
 #include "PerfHelper.h"
+#include "Target.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/CrashRecoveryContext.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
@@ -41,6 +47,8 @@
       : Function(State.createTargetMachine(), std::move(Obj)),
         Scratch(Scratch) {}
 
+  std::string description() const override { return ""; }
+
 private:
   Expected<int64_t> runAndMeasure(const char *Counters) const override {
     // We sum counts when there are several counters for a single ProcRes
@@ -78,23 +86,154 @@
   const ExecutableFunction Function;
   BenchmarkRunner::ScratchSpace *const Scratch;
 };
+
+// Executor used for latency[LBR]: the snippet runs on a worker thread while
+// the calling thread reads the LBR counter created by that worker.
+class LbrFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
+public:
+  LbrFunctionExecutorImpl(
+      const LLVMState &State,
+      llvm::object::OwningBinary<llvm::object::ObjectFile> Obj,
+      BenchmarkRunner::ScratchSpace *Scratch, unsigned LbrSamplePeriod)
+      : Target(State.getExegesisTarget()),
+        Function(State.createTargetMachine(), std::move(Obj)), Scratch(Scratch),
+        LbrSamplePeriod(LbrSamplePeriod) {}
+
+  std::string description() const override { return "LBR"; }
+
+  // State shared between runAndMeasure() and its worker thread; every field
+  // is accessed under Mutex for as long as the worker may still be running.
+  struct ThreadArg {
+    std::mutex Mutex;
+    std::condition_variable ConditionVariable;
+    // Owning: the counter must stay alive while the caller reads it and is
+    // destroyed exactly once, together with this struct (after the join).
+    std::unique_ptr<pfm::Counter> CounterPtr;
+    bool Crashed = false;
+    std::string ErrMsg;
+  };
+
+  // Thread body: creates the counter, publishes it through Arg, then runs the
+  // snippet under crash recovery and records whether it crashed.
+  void workerThread(struct ThreadArg *Arg) const {
+    pfm::PerfEvent PerfEvent("LBR", LbrSamplePeriod);
+    auto CounterOrError = Target.createCounter(PerfEvent);
+    if (!CounterOrError) {
+      // toString() consumes the Error; streaming it would leave it unhandled.
+      const std::string Msg = llvm::toString(CounterOrError.takeError());
+      llvm::errs() << "Error creating counter: " << Msg;
+      std::lock_guard<std::mutex> Lock(Arg->Mutex);
+      Arg->Crashed = true;
+      Arg->ErrMsg = "Cannot create counter.";
+      Arg->ConditionVariable.notify_one();
+      return;
+    }
+    pfm::Counter *Counter = CounterOrError.get();
+    {
+      std::lock_guard<std::mutex> Lock(Arg->Mutex);
+      Arg->CounterPtr.reset(Counter);
+    }
+    Arg->ConditionVariable.notify_one();
+    llvm::CrashRecoveryContext CRC;
+    llvm::CrashRecoveryContext::Enable();
+    const bool Crashed = !CRC.RunSafely([&]() {
+      Counter->start();
+      Function(Scratch->ptr());
+      Counter->stop();
+    });
+    llvm::CrashRecoveryContext::Disable();
+    // FIXME: Better diagnosis.
+    std::lock_guard<std::mutex> Lock(Arg->Mutex);
+    Arg->Crashed = Crashed;
+  }
+
+  llvm::Expected<int64_t> runAndMeasure(const char *) const override {
+    Scratch->clear();
+    struct ThreadArg Arg;
+    std::thread Worker(&LbrFunctionExecutorImpl::workerThread, this, &Arg);
+    pfm::Counter *Counter = nullptr;
+    bool Failed = false;
+    {
+      // Waits until the counter is ready or a "Crashed" signal if
+      // it could not be created.
+      std::unique_lock<std::mutex> Lock(Arg.Mutex);
+      Arg.ConditionVariable.wait(
+          Lock, [&Arg] { return Arg.Crashed || Arg.CounterPtr != nullptr; });
+      Failed = Arg.Crashed;
+      Counter = Arg.CounterPtr.get();
+    }
+    if (Failed) {
+      // Join first: destroying a joinable std::thread calls std::terminate().
+      Worker.join();
+      return make_error<SnippetCrash>(
+          "Snippet crashed while running in thread. Reason: [" + Arg.ErrMsg +
+          "]");
+    }
+    auto ValueOrError = Counter->readOrError();
+    Worker.join();
+    if (!ValueOrError)
+      return ValueOrError.takeError();
+    return ValueOrError.get();
+  }
+
+  const ExegesisTarget &Target;
+  const ExecutableFunction Function;
+  BenchmarkRunner::ScratchSpace *const Scratch;
+  unsigned LbrSamplePeriod;
+};
+
+// Whether the target CPU is on the allow-list for LBR latency measurements.
+bool supportsLbr(const LLVMState &State) {
+  const StringRef Cpu = State.getTargetMachine().getTargetCPU();
+  return Cpu == "haswell" || Cpu == "broadwell" || Cpu == "skylake" ||
+         Cpu == "kabylake";
+}
+
+// Picks the executor for one run: the LBR-based executor in latency mode when
+// a positive LBR sample period was requested and the target CPU supports LBR,
+// otherwise the default executor.
+std::unique_ptr<BenchmarkRunner::FunctionExecutor>
+createFunctionExecutor(InstructionBenchmark::ModeE Mode, const LLVMState &State,
+                       object::OwningBinary<object::ObjectFile> ObjectFile,
+                       BenchmarkRunner::ScratchSpace *Scratch,
+                       const BenchmarkRunner::RunArg &Arg) {
+  const bool WantsLbr =
+      Mode == InstructionBenchmark::ModeE::Latency && Arg.LbrSamplePeriod > 0;
+  if (WantsLbr && supportsLbr(State))
+    return std::make_unique<LbrFunctionExecutorImpl>(
+        State, std::move(ObjectFile), Scratch, Arg.LbrSamplePeriod);
+  if (WantsLbr)
+    llvm::errs() << "LBR not supported on given target["
+                 << State.getTargetMachine().getTargetCPU().str() << "].\n";
+  // Not LBR, or LBR unavailable: falls back to the default executor.
+  return std::make_unique<FunctionExecutorImpl>(State, std::move(ObjectFile),
+                                                Scratch);
+}
+
 } // namespace
 
 Expected<InstructionBenchmark> BenchmarkRunner::runConfiguration(
     const BenchmarkCode &BC, unsigned NumRepetitions,
     ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
     bool DumpObjectToDisk) const {
+  // Legacy entry point: forwards to the RunArg overload with LBR disabled.
+  return runConfiguration(RunArg{BC, NumRepetitions, Repetitors,
+                                 DumpObjectToDisk, /*LbrSamplePeriod=*/0});
+}
+
+Expected<InstructionBenchmark>
+BenchmarkRunner::runConfiguration(const RunArg &Arg) const {
   InstructionBenchmark InstrBenchmark;
   InstrBenchmark.Mode = Mode;
   InstrBenchmark.CpuName = std::string(State.getTargetMachine().getTargetCPU());
   InstrBenchmark.LLVMTriple =
       State.getTargetMachine().getTargetTriple().normalize();
-  InstrBenchmark.NumRepetitions = NumRepetitions;
-  InstrBenchmark.Info = BC.Info;
+  InstrBenchmark.NumRepetitions = Arg.NumRepetitions;
+  InstrBenchmark.Info = Arg.CodeSnippet.Info;
 
-  const std::vector<MCInst> &Instructions = BC.Key.Instructions;
+  const std::vector<MCInst> &Instructions = Arg.CodeSnippet.Key.Instructions;
 
-  InstrBenchmark.Key = BC.Key;
+  InstrBenchmark.Key = Arg.CodeSnippet.Key;
 
   // If we end up having an error, and we've previously succeeded with
   // some other Repetitor, we want to discard the previous measurements.
@@ -112,7 +251,8 @@
   };
   ClearBenchmarkOnReturn CBOR(&InstrBenchmark);
 
-  for (const std::unique_ptr<const SnippetRepetitor> &Repetitor : Repetitors) {
+  for (const std::unique_ptr<const SnippetRepetitor> &Repetitor :
+       Arg.Repetitors) {
     // Assemble at least kMinInstructionsForSnippet instructions by repeating
     // the snippet for debug/analysis. This is so that the user clearly
     // understands that the inside instructions are repeated.
@@ -122,11 +262,13 @@
       raw_svector_ostream OS(Buffer);
       if (Error E = assembleToStream(
               State.getExegesisTarget(), State.createTargetMachine(),
-              BC.LiveIns, BC.Key.RegisterInitialValues,
+              Arg.CodeSnippet.LiveIns,
+              Arg.CodeSnippet.Key.RegisterInitialValues,
               Repetitor->Repeat(Instructions, kMinInstructionsForSnippet),
               OS)) {
         return std::move(E);
       }
+
       const ExecutableFunction EF(State.createTargetMachine(),
                                   getObjectFromBuffer(OS.str()));
       const auto FnBytes = EF.getFunctionBytes();
@@ -141,8 +283,8 @@
         Repetitor->Repeat(Instructions, InstrBenchmark.NumRepetitions);
 
     object::OwningBinary<object::ObjectFile> ObjectFile;
-    if (DumpObjectToDisk) {
-      auto ObjectFilePath = writeObjectFile(BC, Filler);
+    if (Arg.DumpObjectToDisk) {
+      auto ObjectFilePath = writeObjectFile(Arg.CodeSnippet, Filler);
       if (Error E = ObjectFilePath.takeError()) {
         InstrBenchmark.Error = toString(std::move(E));
         return InstrBenchmark;
@@ -155,15 +297,18 @@
       raw_svector_ostream OS(Buffer);
       if (Error E = assembleToStream(
               State.getExegesisTarget(), State.createTargetMachine(),
-              BC.LiveIns, BC.Key.RegisterInitialValues, Filler, OS)) {
+              Arg.CodeSnippet.LiveIns,
+              Arg.CodeSnippet.Key.RegisterInitialValues, Filler, OS)) {
         return std::move(E);
       }
       ObjectFile = getObjectFromBuffer(OS.str());
     }
 
-    const FunctionExecutorImpl Executor(State, std::move(ObjectFile),
-                                        Scratch.get());
-    auto NewMeasurements = runMeasurements(Executor);
+    std::unique_ptr<BenchmarkRunner::FunctionExecutor> Executor =
+        createFunctionExecutor(Mode, State, std::move(ObjectFile),
+                               Scratch.get(), Arg);
+
+    auto NewMeasurements = runMeasurements(*Executor);
     if (Error E = NewMeasurements.takeError()) {
       if (!E.isA<SnippetCrash>())
         return std::move(E);
@@ -183,7 +328,7 @@
       continue;
     }
 
-    assert(Repetitors.size() > 1 && !InstrBenchmark.Measurements.empty() &&
+    assert(Arg.Repetitors.size() > 1 && !InstrBenchmark.Measurements.empty() &&
            "We're in an 'min' repetition mode, and need to aggregate new "
            "result to the existing result.");
     assert(InstrBenchmark.Measurements.size() == NewMeasurements->size() &&
diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
--- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
@@ -25,10 +25,24 @@
                          InstructionBenchmark::ModeE Mode);
   ~LatencyBenchmarkRunner() override;
 
+private:
+  Expected<std::vector<BenchmarkMeasure>>
+  runMeasurementsWithLbr(const FunctionExecutor &Executor) const;
+
+  Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const override;
+};
+
+// NOTE(review): presumably the LBR-specific latency runner; unused in this patch.
+class LbrLatencyBenchmarkRunner : public BenchmarkRunner {
+public:
+  explicit LbrLatencyBenchmarkRunner(const LLVMState &State);
+  ~LbrLatencyBenchmarkRunner() override = default;
 private:
   Expected<std::vector<BenchmarkMeasure>>
   runMeasurements(const FunctionExecutor &Executor) const override;
 };
+
 } // namespace exegesis
 } // namespace llvm
 
diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
--- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
@@ -41,10 +41,12 @@
   std::vector<BenchmarkMeasure> Result;
   switch (Mode) {
   case InstructionBenchmark::Latency:
-    Result = {BenchmarkMeasure::Create("latency", MinValue)};
+    Result = {BenchmarkMeasure::Create(
+        "latency[" + Executor.description() + "]", MinValue)};
     break;
   case InstructionBenchmark::InverseThroughput:
-    Result = {BenchmarkMeasure::Create("inverse_throughput", MinValue)};
+    Result = {BenchmarkMeasure::Create(
+        "inverse_throughput[" + Executor.description() + "]", MinValue)};
     break;
   default:
     break;
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h
--- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h
+++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h
@@ -17,6 +17,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
 #include <functional>
 #include <memory>
 
@@ -36,7 +38,7 @@
 public:
   // http://perfmon2.sourceforge.net/manv4/libpfm.html
   // Events are expressed as strings. e.g. "INSTRUCTION_RETIRED"
-  explicit PerfEvent(StringRef pfm_event_string);
+  explicit PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod = 0);
 
   PerfEvent(const PerfEvent &) = delete;
   PerfEvent(PerfEvent &&other);
@@ -55,7 +57,13 @@
   // e.g. "snb_ep::INSTRUCTION_RETIRED:e=0:i=0:c=0:t=0:u=1:k=0:mg=0:mh=1"
   StringRef getPfmEventString() const;
 
+  // Returns true if it should use LBR.
+  bool useLbr() const;
+
 private:
+  void initPerfEvent();
+  void initPerfEventForLbr(unsigned SamplingPeriod);
+
   const std::string EventString;
   std::string FullQualifiedEventString;
   perf_event_attr *Attr;
@@ -63,23 +71,32 @@
 
 // Uses a valid PerfEvent to configure the Kernel so we can measure the
 // underlying event.
-struct Counter {
+class Counter {
+public:
   // event: the PerfEvent to measure.
   explicit Counter(const PerfEvent &event);
 
   Counter(const Counter &) = delete;
   Counter(Counter &&other) = default;
 
-  ~Counter();
+  virtual ~Counter();
 
-  void start();         // Starts the measurement of the event.
-  void stop();          // Stops the measurement of the event.
-  int64_t read() const; // Return the current value of the counter.
+  /// Starts the measurement of the event.
+  virtual void start();
 
-private:
+  /// Stops the measurement of the event.
+  void stop();
+
+  /// Returns the current value of the counter.
+  virtual int64_t read() const;
+
+  /// Returns the current value of the counter or error if it cannot be read.
+  virtual llvm::Expected<int64_t> readOrError() const { return read(); }
+
+protected:
 #ifdef HAVE_LIBPFM
   int FileDescriptor = -1;
-#endif
+#endif // HAVE_LIBPFM
 };
 
 // Helper to measure a list of PerfEvent for a particular function.
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
--- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
+++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
@@ -8,13 +8,23 @@
 
 #include "PerfHelper.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
+#include <memory>
 #ifdef HAVE_LIBPFM
 #include "perfmon/perf_event.h"
 #include "perfmon/pfmlib.h"
 #include "perfmon/pfmlib_perf_event.h"
 #endif
+#include <atomic>
 #include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 namespace llvm {
 namespace exegesis {
@@ -52,8 +62,7 @@
   Other.Attr = nullptr;
 }
 
-PerfEvent::PerfEvent(StringRef PfmEventString)
-    : EventString(PfmEventString.str()), Attr(nullptr) {
+void PerfEvent::initPerfEvent() {
 #ifdef HAVE_LIBPFM
   char *Fstr = nullptr;
   pfm_perf_encode_arg_t Arg = {};
@@ -77,10 +86,42 @@
 #endif
 }
 
+void PerfEvent::initPerfEventForLbr(unsigned SamplingPeriod) {
+#ifdef HAVE_LIBPFM // Otherwise a no-op: Attr stays null and valid() is false.
+  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
+  Attr = new perf_event_attr();
+  *Attr = {0};
+  Attr->size = sizeof(*Attr);
+  Attr->type = PERF_TYPE_HARDWARE;
+  Attr->config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS; // Event being counted.
+  Attr->sample_type = PERF_SAMPLE_BRANCH_STACK; // Samples carry branch records.
+  Attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY;
+  Attr->sample_period = SamplingPeriod; // Branches between two samples.
+  Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
+  Attr->disabled = 1; // X86LbrCounter::start() issues PERF_EVENT_IOC_REFRESH.
+  Attr->exclude_kernel = 1;
+  Attr->exclude_hv = 1;
+  Attr->read_format = PERF_FORMAT_GROUP;
+
+  FullQualifiedEventString = "LBR"; // Non-empty, so valid() returns true.
+#endif
+}
+
+// "LBR" is a reserved event name; any other name is encoded by initPerfEvent().
+PerfEvent::PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod)
+    : EventString(PfmEventString.str()), Attr(nullptr) {
+  if (useLbr())
+    initPerfEventForLbr(SamplingPeriod);
+  else
+    initPerfEvent();
+}
+
 StringRef PerfEvent::name() const { return EventString; }
 
 bool PerfEvent::valid() const { return !FullQualifiedEventString.empty(); }
 
+bool PerfEvent::useLbr() const { return EventString == "LBR"; }
+
 const perf_event_attr *PerfEvent::attribute() const { return Attr; }
 
 StringRef PerfEvent::getPfmEventString() const {
diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h
--- a/llvm/tools/llvm-exegesis/lib/Target.h
+++ b/llvm/tools/llvm-exegesis/lib/Target.h
@@ -27,6 +27,7 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Error.h"
 
 namespace llvm {
 namespace exegesis {
@@ -65,6 +66,11 @@
   explicit ExegesisTarget(ArrayRef<CpuAndPfmCounters> CpuPfmCounters)
       : CpuPfmCounters(CpuPfmCounters) {}
 
+  virtual Expected<pfm::Counter *>
+  createCounter(const pfm::PerfEvent &Event) const {
+    return new pfm::Counter(Event); // Heap-allocated; the caller deletes it.
+  }
+
   // Targets can use this to add target-specific passes in assembleToStream();
   virtual void addTargetSpecificPasses(PassManagerBase &PM) const {}
 
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -14,12 +14,31 @@
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "X86.h"
+#include "X86Counter.h"
 #include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/MC/MCInstBuilder.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 
+#ifdef HAVE_LIBPFM
+#include "perfmon/perf_event.h"
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif // HAVE_LIBPFM
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <vector>
+
 namespace llvm {
 namespace exegesis {
 
@@ -559,10 +578,26 @@
 #include "X86GenExegesis.inc"
 
 namespace {
+
 class ExegesisX86Target : public ExegesisTarget {
 public:
   ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
 
+  Expected<pfm::Counter *>
+  createCounter(const pfm::PerfEvent &Event) const override {
+    // LBR events use X86LbrCounter (libpfm builds only), else report an error.
+    if (Event.useLbr()) {
+#ifdef HAVE_LIBPFM
+      return new X86LbrCounter(Event);
+#else
+      return llvm::make_error<llvm::StringError>(
+          "LBR counter requested without PFM package available..",
+          llvm::errc::invalid_argument);
+#endif
+    }
+    return new pfm::Counter(Event); // Any other event: the generic counter.
+  }
+
 private:
   void addTargetSpecificPasses(PassManagerBase &PM) const override;
 
diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h
@@ -0,0 +1,34 @@
+// Declares X86LbrCounter, the perf counter used for LBR events on x86.
+
+#ifndef LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H
+#define LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H
+
+#include "../PerfHelper.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace exegesis {
+
+#ifdef HAVE_LIBPFM
+
+// Counter used for LBR events; only declared when libpfm is available.
+class X86LbrCounter : public pfm::Counter {
+public:
+  explicit X86LbrCounter(const pfm::PerfEvent &Event);
+
+  ~X86LbrCounter() override;
+
+  void start() override;
+  int64_t read() const override;
+  llvm::Expected<int64_t> readOrError() const override;
+
+private:
+  // Address returned by mmap() in the constructor (MAP_FAILED on failure).
+  void *MMappedBuffer = nullptr;
+};
+#endif // HAVE_LIBPFM
+
+} // namespace exegesis
+} // namespace llvm
+
+#endif // LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H
diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp
@@ -0,0 +1,208 @@
+#include "X86Counter.h"
+
+#include "llvm/Support/Errc.h"
+
+#ifdef HAVE_LIBPFM
+#include "perfmon/perf_event.h"
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif // HAVE_LIBPFM
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <vector>
+
+#ifdef HAVE_LIBPFM
+namespace llvm {
+namespace exegesis {
+namespace {
+
+static constexpr int kBufferPages = 8; // Data pages in the perf ring buffer.
+static const int kDataBufferSize = kBufferPages * getpagesize();
+
+// Waits up to 10000 ms for an LBR perf event; returns poll()'s raw result.
+int pollLbrPerfEvent(const int FileDescriptor) {
+  constexpr int kTimeoutMs = 10000;
+  struct pollfd PollFd = {}; // Also zeroes revents.
+  PollFd.fd = FileDescriptor;
+  PollFd.events = POLLIN;
+  return poll(&PollFd, /*nfds=*/1, kTimeoutMs);
+}
+
+// Linearizes the cyclic data buffer into Buf, starting at ring offset Tail.
+void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
+                    size_t DataSize) {
+  // First page is reserved for perf_event_mmap_page. Data buffer starts on
+  // the next page.
+  char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize();
+  // Copy from Tail to the end of the ring; Buf must hold kDataBufferSize bytes.
+  uint64_t Offset = Tail % kDataBufferSize;
+  size_t CopySize = kDataBufferSize - Offset;
+  memcpy(Buf, Start + Offset, CopySize);
+  if (CopySize >= DataSize) // The requested data did not wrap around.
+    return;
+  memcpy(Buf + CopySize, Start, Offset); // Wrapped part, from the ring start.
+}
+
+// Rewrites the jmp (from the BM loop) with `pop $rbx; ret` to get
+// out of the benchmarked code.
+//
+// pc: Must point at the start of the jmp instruction.
+void patchBasicBlockToEndBenchmarkedLoop(char *pc) {
+  void *page = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(pc) &
+                                        ~(getpagesize() - 1));
+  mprotect(page, getpagesize(), PROT_READ | PROT_WRITE | PROT_EXEC);
+  if ((reinterpret_cast<uint64_t>(pc) & 0xf) != 0xf) {
+    // We have at least two bytes in the same cacheline, so we can use an
+    // atomic write to replace the jmp with pop, ret.
+    *reinterpret_cast<uint16_t *>(pc) = 0xc35b; // pop %rbx; ret;
+  } else {
+    // pc and pc+1 might be at different cacheline, so we first add the pop,
+    // ret instructions, and then update jump offset to jump to pop
+    // instruction.
+    *reinterpret_cast<uint16_t *>(pc + 5) = 0xc35b; // pop %rbx; ret;
+    // An atomic update on the displacement. The opcode is compared as an
+    // unsigned byte: a plain (signed) char can never equal 0xeb.
+    if (static_cast<unsigned char>(pc[0]) == 0xeb /* x86 short jump */) {
+      pc[1] = 0x03; // offset is 1 byte
+    } else { /* x86 near jump */
+      // offset is 4 bytes
+      *reinterpret_cast<uint32_t *>(pc + 1) = 0;
+    }
+  }
+}
+
+// Parses the given data-buffer for stats and fill the CycleArray.
+// If data has been extracted successfully, also modifies the code to jump
+// out the benchmark loop. Reads stay within DataSize, writes within CycleArray.
+llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize,
+                            std::vector<int64_t> *CycleArray, size_t Pos) {
+  size_t Off = 0; // Byte offset of the next unread field in DataBuf.
+  struct perf_event_header Header;
+  while (Pos < CycleArray->size() && Off + sizeof(Header) <= DataSize) {
+    memcpy(&Header, DataBuf + Off, sizeof(struct perf_event_header));
+    if (Header.size == 0)
+      break; // A zero-sized record would never advance Off.
+    if (Header.type != PERF_RECORD_SAMPLE) {
+      // Ignores non-sample records.
+      Off += Header.size;
+      continue;
+    }
+    Off += sizeof(Header);
+    if (Off + sizeof(uint64_t) > DataSize)
+      break;
+    const uint64_t Count =
+        llvm::support::endian::read64(DataBuf + Off, support::native);
+    Off += sizeof(Count);
+    struct perf_branch_entry Entry; // Current element of the branch array.
+    char *JumpPC = nullptr;
+    int64_t MinCycle = 0;
+    for (uint64_t I = 0; I < Count && Off + sizeof(Entry) <= DataSize;
+         ++I, Off += sizeof(Entry)) {
+      memcpy(&Entry, DataBuf + Off, sizeof(struct perf_branch_entry));
+      // We use the JumpPC from the entry with min cycle to avoid the entry
+      // that is returning from kernel.
+      if (I == 0 || MinCycle > static_cast<int64_t>(Entry.cycles)) {
+        MinCycle = Entry.cycles;
+        JumpPC = reinterpret_cast<char *>(Entry.from);
+      }
+      (*CycleArray)[Pos++] = Entry.cycles;
+      if (Pos == CycleArray->size()) {
+        patchBasicBlockToEndBenchmarkedLoop(JumpPC);
+        return llvm::Error::success();
+      }
+    }
+  }
+  return llvm::make_error<llvm::StringError>("Unable to read databuffer.",
+                                             llvm::errc::io_error);
+}
+
+} // namespace
+
+X86LbrCounter::X86LbrCounter(const pfm::PerfEvent &Event) : Counter(Event) {
+  // Page 0 holds perf_event_mmap_page; the data buffer starts on page 1.
+  MMappedBuffer = mmap(nullptr, (kBufferPages + 1) * getpagesize(),
+                       PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0);
+  if (MMappedBuffer == MAP_FAILED)
+    llvm::errs() << "Failed to mmap buffer.";
+}
+
+X86LbrCounter::~X86LbrCounter() {
+  munmap(MMappedBuffer, (kBufferPages + 1) * getpagesize()); // fd left to base.
+}
+
+void X86LbrCounter::start() {
+  ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
+}
+
+int64_t X86LbrCounter::read() const {
+  auto ValueOrError = readOrError();
+  if (ValueOrError)
+    return ValueOrError.get();
+  llvm::errs() << "Error reading counter: "
+               << llvm::toString(ValueOrError.takeError()) << "\n";
+  return 0; // Failure was logged above; toString() consumed the Error.
+}
+
+llvm::Expected<int64_t> X86LbrCounter::readOrError() const {
+  // The max number of time-outs/retries before we give up.
+  static constexpr int kMaxTimeouts = 160;
+
+  // Parses the LBR buffer and fills CycleArray with the sequence of cycle
+  // counts from the buffer. parseDataBuffer() indexes into the array, so it
+  // must be pre-sized; only element 0 is consumed below, hence one slot.
+  std::vector<int64_t> CycleArray(1);
+  std::unique_ptr<char[]> DataBuf(new char[kDataBufferSize]);
+  size_t Pos = 0;
+  int NumTimeouts = 0;
+  while (true) {
+    const int PollResult = pollLbrPerfEvent(FileDescriptor);
+    if (PollResult == -1)
+      return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
+                                                 llvm::errc::io_error);
+    if (PollResult == 0) {
+      llvm::errs() << "LBR polling timed out without result, NumTimeouts ="
+                   << NumTimeouts << ". ";
+      if (NumTimeouts < kMaxTimeouts) {
+        llvm::errs() << "Retrying ...\n";
+        ++NumTimeouts;
+        continue;
+      }
+      llvm::errs() << "At max-timeouts. Giving up.\n";
+      return llvm::make_error<llvm::StringError>(
+          "LBR polling still timed out after max number of attempts.",
+          llvm::errc::device_or_resource_busy);
+    }
+
+    struct perf_event_mmap_page Page;
+    memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));
+    const uint64_t DataTail = Page.data_tail;
+    const uint64_t DataHead = Page.data_head;
+    // We're supposed to use a barrier after reading data_head.
+    std::atomic_thread_fence(std::memory_order_acq_rel);
+    const size_t DataSize = DataHead - DataTail;
+    if (DataSize > static_cast<size_t>(kDataBufferSize))
+      return llvm::make_error<llvm::StringError>(
+          "DataSize larger than buffer size.", llvm::errc::invalid_argument);
+    copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
+    llvm::Error error =
+        parseDataBuffer(DataBuf.get(), DataSize, &CycleArray, Pos);
+    if (!error) {
+      // TODO(vyng) Analyse the array and get proper value.
+      return CycleArray[0];
+    }
+    // Nothing usable yet: poll again. A failed llvm::Error must be consumed
+    // before it is destroyed.
+    llvm::consumeError(std::move(error));
+  }
+}
+
+} // namespace exegesis
+} // namespace llvm
+
+#endif // HAVE_LIBPFM
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -83,6 +83,20 @@
                clEnumValN(exegesis::InstructionBenchmark::Unknown, "analysis",
                           "Analysis")));
 
+// Sampling period for LBR-based latency measurements. A positive value selects
+// the LBR executor in latency mode; the default (0) keeps LBR disabled.
+//
+// Note:
+//  -  A small value is preferred, but too low a value could result in
+//     throttling.
+//  -  A prime number is preferred to avoid hiding blocks.
+//
+// Based on experiments so far, `521` is the recommended value.
+static cl::opt<unsigned> LbrSamplePeriod(
+    "x86-lbr-sample-period",
+    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
+    cl::cat(BenchmarkOptions), cl::init(0));
+
 static cl::opt<exegesis::InstructionBenchmark::RepetitionModeE> RepetitionMode(
     "repetition-mode", cl::desc("how to repeat the instruction snippet"),
     cl::cat(BenchmarkOptions),
@@ -332,6 +346,14 @@
     Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
   }
 
+#if !defined(__x86_64__) && !defined(__x86__) && !defined(__i386__)
+  if (exegesis::BenchmarkMode == exegesis::InstructionBenchmark::Latency &&
+      LbrSamplePeriod > 0) {
+    ExitOnErr.setBanner("llvm-exegesis: ");
+    ExitWithError("LBR mode must be run on x86 arch.");
+  }
+#endif
+
   if (NumRepetitions == 0) {
     ExitOnErr.setBanner("llvm-exegesis: ");
     ExitWithError("--num-repetitions must be greater than zero");
@@ -342,8 +364,10 @@
     BenchmarkFile = "-";
 
   for (const BenchmarkCode &Conf : Configurations) {
-    InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration(
-        Conf, NumRepetitions, Repetitors, DumpObjectToDisk));
+    BenchmarkRunner::RunArg RunArg{Conf, NumRepetitions, Repetitors,
+                                   DumpObjectToDisk, LbrSamplePeriod};
+
+    InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration(RunArg));
     ExitOnFileError(BenchmarkFile, Result.writeYaml(State, BenchmarkFile));
   }
   exegesis::pfm::pfmTerminate();