diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -190,11 +190,19 @@
  Specify the custom code snippet to measure. See example 2 for details.
  Either `opcode-index`, `opcode-name` or `snippets-file` must be set.
 
-.. option:: -mode=[latency|uops|inverse_throughput|analysis]
+.. option:: -mode=[latency|lbr_latency|uops|inverse_throughput|analysis]
+
+ Specify the run mode. Note that some modes have additional requirements.
+
+ `lbr_latency` mode makes use of LBR, which, starting with Skylake, contains the
+ precise number of cycles between the two consecutive branches. This will be
+ significantly more precise than the method using RDTSC. This mode should be run
+ with at least `Haswell`, but preferably `Skylake` for more precise measurements.
+ Using `lbr_latency` requires setting `lbr-sample-period`.
+
+ In `analysis` mode, you also need to specify at least one of the
+ `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`.
 
- Specify the run mode. Note that if you pick `analysis` mode, you also need
- to specify at least one of the `-analysis-clusters-output-file=` and
- `-analysis-inconsistencies-output-file=`.
 
 .. option:: -repetition-mode=[duplicate|loop|min]
 
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -59,7 +59,15 @@
 // The result of an instruction benchmark.
 struct InstructionBenchmark {
   InstructionBenchmarkKey Key;
-  enum ModeE { Unknown, Latency, Uops, InverseThroughput };
+  enum ModeE {
+    Unknown,
+    Latency,
+    Uops,
+    InverseThroughput,
+    // LbrLatency mode is only available on x86.
+    // Must be used with at least Haswell.
+    LbrLatency,
+  };
   ModeE Mode;
   std::string CpuName;
   std::string LLVMTriple;
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -20,6 +20,7 @@
 #include "BenchmarkResult.h"
 #include "LlvmState.h"
 #include "MCInstrDescView.h"
+#include "PerfHelper.h"
 #include "SnippetRepetitor.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Error.h"
@@ -33,16 +34,26 @@
 // Common code for all benchmark modes.
 class BenchmarkRunner {
 public:
+  struct RunArg {
+    const BenchmarkCode &CodeSnippet;
+    unsigned NumRepetitions;
+    ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors;
+    bool DumpObjectToDisk;
+    unsigned LbrSamplePeriod;
+  };
   explicit BenchmarkRunner(const LLVMState &State,
                            InstructionBenchmark::ModeE Mode);
 
   virtual ~BenchmarkRunner();
 
+  // TODO(vyng) Make this "Deprecated" and switch caller to using RunArg
   Expected<InstructionBenchmark>
   runConfiguration(const BenchmarkCode &Configuration, unsigned NumRepetitions,
                    ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
                    bool DumpObjectToDisk) const;
 
+  Expected<InstructionBenchmark> runConfiguration(const RunArg &Arg) const;
+
   // Scratch space to run instructions that touch memory.
   struct ScratchSpace {
     static constexpr const size_t kAlignment = 1024;
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -7,9 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include <array>
+#include <memory>
+#include <mutex>
 #include <string>
+#include <thread>
 
 #include "Assembler.h"
+#include "BenchmarkResult.h"
 #include "BenchmarkRunner.h"
 #include "Error.h"
 #include "MCInstrDescView.h"
@@ -19,6 +23,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/CrashRecoveryContext.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
@@ -78,23 +83,136 @@
   const ExecutableFunction Function;
   BenchmarkRunner::ScratchSpace *const Scratch;
 };
+
+// Runs the snippet on a worker thread that owns the "LBR" counter while the
+// calling thread reads that counter. The worker keeps the counter alive until
+// the reader reports that it is done with it.
+class LbrFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
+public:
+  LbrFunctionExecutorImpl(
+      const LLVMState &State,
+      llvm::object::OwningBinary<llvm::object::ObjectFile> Obj,
+      BenchmarkRunner::ScratchSpace *Scratch, unsigned LbrSamplePeriod)
+      : State(State), Function(State.createTargetMachine(), std::move(Obj)),
+        Scratch(Scratch), LbrSamplePeriod(LbrSamplePeriod) {}
+
+private:
+  // Handshake between runAndMeasure() and workerThread(); guarded by Mutex.
+  struct ThreadArg {
+    std::mutex Mutex;
+    std::condition_variable ConditionVariable;
+    pfm::Counter *CounterPtr = nullptr; // Stays null if creation failed.
+    std::string SetupError;  // Why the counter could not be created.
+    bool Ready = false;      // Worker published CounterPtr or SetupError.
+    bool ReaderDone = false; // Reader no longer dereferences CounterPtr.
+    bool Crashed = false;    // Snippet crashed; read it after joining.
+  };
+
+  // Creates the LBR counter on this thread, runs the snippet under crash
+  // recovery, then waits for the reader before the counter is destroyed.
+  void workerThread(ThreadArg *Arg) const {
+    pfm::PerfEvent PerfEvent("LBR", LbrSamplePeriod);
+    auto CounterOrError = State.getExegesisTarget().createCounter(PerfEvent);
+    const bool HasCounter = static_cast<bool>(CounterOrError);
+    {
+      std::lock_guard<std::mutex> Lock(Arg->Mutex);
+      if (HasCounter)
+        Arg->CounterPtr = &*CounterOrError;
+      else
+        Arg->SetupError = toString(CounterOrError.takeError());
+      Arg->Ready = true;
+    }
+    Arg->ConditionVariable.notify_all();
+    if (!HasCounter)
+      return;
+    llvm::CrashRecoveryContext CRC;
+    llvm::CrashRecoveryContext::Enable();
+    const bool Crashed = !CRC.RunSafely([&]() {
+      CounterOrError->start();
+      Function(Scratch->ptr());
+      CounterOrError->stop();
+    });
+    llvm::CrashRecoveryContext::Disable();
+    // The reader points at the counter: wait for it before returning.
+    std::unique_lock<std::mutex> Lock(Arg->Mutex);
+    Arg->Crashed = Crashed;
+    Arg->ConditionVariable.wait(Lock, [Arg] { return Arg->ReaderDone; });
+  }
+
+  llvm::Expected<int64_t> runAndMeasure(const char *) const override {
+    Scratch->clear();
+    ThreadArg Arg;
+    std::thread Worker(&LbrFunctionExecutorImpl::workerThread, this, &Arg);
+    {
+      // Waits until the worker has published the counter or given up.
+      std::unique_lock<std::mutex> Lock(Arg.Mutex);
+      Arg.ConditionVariable.wait(Lock, [&Arg] { return Arg.Ready; });
+    }
+    // Read while the worker is still around; it is joined on every path below.
+    llvm::Expected<int64_t> ValueOrError =
+        Arg.CounterPtr ? Arg.CounterPtr->readOrError()
+                       : llvm::Expected<int64_t>(make_error<StringError>(
+                             Arg.SetupError, inconvertibleErrorCode()));
+    {
+      std::lock_guard<std::mutex> Lock(Arg.Mutex);
+      Arg.ReaderDone = true;
+    }
+    Arg.ConditionVariable.notify_all();
+    Worker.join();
+    if (Arg.Crashed) {
+      // An Expected must be checked before it is dropped.
+      if (!ValueOrError)
+        consumeError(ValueOrError.takeError());
+      return make_error<SnippetCrash>(
+          "Snippet crashed while running in thread.");
+    }
+    if (!ValueOrError)
+      return ValueOrError.takeError();
+    return *ValueOrError;
+  }
+
+  const LLVMState &State;
+  const ExecutableFunction Function;
+  BenchmarkRunner::ScratchSpace *const Scratch;
+  unsigned LbrSamplePeriod;
+};
+
+// Creates the LBR executor in LbrLatency mode, the regular one otherwise.
+std::unique_ptr<BenchmarkRunner::FunctionExecutor>
+createFunctionExecutor(InstructionBenchmark::ModeE Mode, const LLVMState &State,
+                       object::OwningBinary<object::ObjectFile> ObjectFile,
+                       BenchmarkRunner::ScratchSpace *Scratch,
+                       const BenchmarkRunner::RunArg &Arg) {
+  if (Mode != InstructionBenchmark::ModeE::LbrLatency)
+    return std::make_unique<FunctionExecutorImpl>(State, std::move(ObjectFile),
+                                                  Scratch);
+  return std::make_unique<LbrFunctionExecutorImpl>(
+      State, std::move(ObjectFile), Scratch, Arg.LbrSamplePeriod);
+}
+
 } // namespace
 
 Expected<InstructionBenchmark> BenchmarkRunner::runConfiguration(
     const BenchmarkCode &BC, unsigned NumRepetitions,
     ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
     bool DumpObjectToDisk) const {
+  RunArg Arg{BC, NumRepetitions, Repetitors, DumpObjectToDisk};
+  return runConfiguration(Arg);
+}
+
+Expected<InstructionBenchmark>
+BenchmarkRunner::runConfiguration(const RunArg &Arg) const {
   InstructionBenchmark InstrBenchmark;
   InstrBenchmark.Mode = Mode;
   InstrBenchmark.CpuName = std::string(State.getTargetMachine().getTargetCPU());
   InstrBenchmark.LLVMTriple =
       State.getTargetMachine().getTargetTriple().normalize();
-  InstrBenchmark.NumRepetitions = NumRepetitions;
-  InstrBenchmark.Info = BC.Info;
+  InstrBenchmark.NumRepetitions = Arg.NumRepetitions;
+  InstrBenchmark.Info = Arg.CodeSnippet.Info;
 
-  const std::vector<MCInst> &Instructions = BC.Key.Instructions;
+  const std::vector<MCInst> &Instructions = Arg.CodeSnippet.Key.Instructions;
 
-  InstrBenchmark.Key = BC.Key;
+  InstrBenchmark.Key = Arg.CodeSnippet.Key;
 
   // If we end up having an error, and we've previously succeeded with
   // some other Repetitor, we want to discard the previous measurements.
@@ -112,7 +213,8 @@
   };
   ClearBenchmarkOnReturn CBOR(&InstrBenchmark);
 
-  for (const std::unique_ptr<const SnippetRepetitor> &Repetitor : Repetitors) {
+  for (const std::unique_ptr<const SnippetRepetitor> &Repetitor :
+       Arg.Repetitors) {
     // Assemble at least kMinInstructionsForSnippet instructions by repeating
     // the snippet for debug/analysis. This is so that the user clearly
     // understands that the inside instructions are repeated.
@@ -122,11 +224,13 @@
       raw_svector_ostream OS(Buffer);
       if (Error E = assembleToStream(
               State.getExegesisTarget(), State.createTargetMachine(),
-              BC.LiveIns, BC.Key.RegisterInitialValues,
+              Arg.CodeSnippet.LiveIns,
+              Arg.CodeSnippet.Key.RegisterInitialValues,
               Repetitor->Repeat(Instructions, kMinInstructionsForSnippet),
               OS)) {
         return std::move(E);
       }
+
       const ExecutableFunction EF(State.createTargetMachine(),
                                   getObjectFromBuffer(OS.str()));
       const auto FnBytes = EF.getFunctionBytes();
@@ -142,7 +246,7 @@
 
     object::OwningBinary<object::ObjectFile> ObjectFile;
     if (DumpObjectToDisk) {
-      auto ObjectFilePath = writeObjectFile(BC, Filler);
+      auto ObjectFilePath = writeObjectFile(Arg.CodeSnippet, Filler);
       if (Error E = ObjectFilePath.takeError()) {
         InstrBenchmark.Error = toString(std::move(E));
         return InstrBenchmark;
@@ -155,15 +259,18 @@
       raw_svector_ostream OS(Buffer);
       if (Error E = assembleToStream(
               State.getExegesisTarget(), State.createTargetMachine(),
-              BC.LiveIns, BC.Key.RegisterInitialValues, Filler, OS)) {
+              Arg.CodeSnippet.LiveIns,
+              Arg.CodeSnippet.Key.RegisterInitialValues, Filler, OS)) {
         return std::move(E);
       }
       ObjectFile = getObjectFromBuffer(OS.str());
     }
 
-    const FunctionExecutorImpl Executor(State, std::move(ObjectFile),
-                                        Scratch.get());
-    auto NewMeasurements = runMeasurements(Executor);
+    std::unique_ptr<BenchmarkRunner::FunctionExecutor> Executor =
+        createFunctionExecutor(Mode, State, std::move(ObjectFile),
+                               Scratch.get(), Arg);
+
+    auto NewMeasurements = runMeasurements(*Executor);
     if (Error E = NewMeasurements.takeError()) {
       if (!E.isA<SnippetCrash>())
         return std::move(E);
@@ -183,7 +290,7 @@
       continue;
     }
 
-    assert(Repetitors.size() > 1 && !InstrBenchmark.Measurements.empty() &&
+    assert(Arg.Repetitors.size() > 1 && !InstrBenchmark.Measurements.empty() &&
            "We're in an 'min' repetition mode, and need to aggregate new "
            "result to the existing result.");
     assert(InstrBenchmark.Measurements.size() == NewMeasurements->size() &&
diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
--- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h
@@ -29,6 +29,17 @@
   Expected<std::vector<BenchmarkMeasure>>
   runMeasurements(const FunctionExecutor &Executor) const override;
 };
+
+// Runner for InstructionBenchmark::LbrLatency; measures via the "LBR" counter.
+class LbrLatencyBenchmarkRunner : public BenchmarkRunner {
+public:
+  explicit LbrLatencyBenchmarkRunner(const LLVMState &State);
+  ~LbrLatencyBenchmarkRunner() override = default;
+private:
+  Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const override;
+};
+
 } // namespace exegesis
 } // namespace llvm
 
diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
--- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp
@@ -52,5 +52,20 @@
   return std::move(Result);
 }
 
+LbrLatencyBenchmarkRunner::LbrLatencyBenchmarkRunner(const LLVMState &State)
+    : BenchmarkRunner(State, InstructionBenchmark::LbrLatency) {}
+
+llvm::Expected<std::vector<BenchmarkMeasure>>
+LbrLatencyBenchmarkRunner::runMeasurements(
+    const FunctionExecutor &Executor) const {
+  // TODO(vyng) Maybe verify that we're running with the right CPU/config
+  // Reads the "LBR" counter once and reports it as a single measure.
+  llvm::Expected<int64_t> CounterValue = Executor.runAndMeasure("LBR");
+  if (Error E = CounterValue.takeError())
+    return std::move(E);
+  return std::vector<BenchmarkMeasure>{
+      BenchmarkMeasure::Create("lbr-latency", *CounterValue)};
+}
+
 } // namespace exegesis
 } // namespace llvm
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h
--- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h
+++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h
@@ -17,6 +17,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
 #include <functional>
 #include <memory>
 
@@ -36,7 +38,7 @@
 public:
   // http://perfmon2.sourceforge.net/manv4/libpfm.html
   // Events are expressed as strings. e.g. "INSTRUCTION_RETIRED"
-  explicit PerfEvent(StringRef pfm_event_string);
+  explicit PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod = 0);
 
   PerfEvent(const PerfEvent &) = delete;
   PerfEvent(PerfEvent &&other);
@@ -55,7 +57,14 @@
   // e.g. "snb_ep::INSTRUCTION_RETIRED:e=0:i=0:c=0:t=0:u=1:k=0:mg=0:mh=1"
   StringRef getPfmEventString() const;
 
+  // Returns true if it should use LBR.
+  bool useLbr() const;
+
 private:
+  void initPerfEvent();
+  void initPerfEventForLbr();
+  unsigned LbrSamplePeriod;
+
   const std::string EventString;
   std::string FullQualifiedEventString;
   perf_event_attr *Attr;
@@ -63,23 +72,32 @@
 
 // Uses a valid PerfEvent to configure the Kernel so we can measure the
 // underlying event.
-struct Counter {
+class Counter {
+public:
   // event: the PerfEvent to measure.
   explicit Counter(const PerfEvent &event);
 
   Counter(const Counter &) = delete;
   Counter(Counter &&other) = default;
 
-  ~Counter();
+  virtual ~Counter();
 
-  void start();         // Starts the measurement of the event.
-  void stop();          // Stops the measurement of the event.
-  int64_t read() const; // Return the current value of the counter.
+  /// Starts the measurement of the event.
+  virtual void start();
 
-private:
+  /// Stops the measurement of the event.
+  void stop();
+
+  /// Returns the current value of the counter.
+  virtual int64_t read() const;
+
+  /// Returns the current value of the counter or error if it cannot be read.
+  virtual llvm::Expected<int64_t> readOrError() const { return read(); }
+
+protected:
 #ifdef HAVE_LIBPFM
   int FileDescriptor = -1;
-#endif
+#endif // HAVE_LIBPFM
 };
 
 // Helper to measure a list of PerfEvent for a particular function.
diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
--- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
+++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp
@@ -8,13 +8,23 @@
 
 #include "PerfHelper.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
+#include <memory>
 #ifdef HAVE_LIBPFM
 #include "perfmon/perf_event.h"
 #include "perfmon/pfmlib.h"
 #include "perfmon/pfmlib_perf_event.h"
 #endif
+#include <atomic>
 #include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 namespace llvm {
 namespace exegesis {
@@ -46,14 +56,14 @@
 }
 
 PerfEvent::PerfEvent(PerfEvent &&Other)
-    : EventString(std::move(Other.EventString)),
+    : LbrSamplePeriod(Other.LbrSamplePeriod),
+      EventString(std::move(Other.EventString)),
       FullQualifiedEventString(std::move(Other.FullQualifiedEventString)),
       Attr(Other.Attr) {
   Other.Attr = nullptr;
 }
 
-PerfEvent::PerfEvent(StringRef PfmEventString)
-    : EventString(PfmEventString.str()), Attr(nullptr) {
+void PerfEvent::initPerfEvent() {
 #ifdef HAVE_LIBPFM
   char *Fstr = nullptr;
   pfm_perf_encode_arg_t Arg = {};
@@ -77,10 +87,40 @@
 #endif
 }
 
+void PerfEvent::initPerfEventForLbr() {
+#ifdef HAVE_LIBPFM // perf_event_attr and PERF_* come from libpfm's headers.
+  Attr = new perf_event_attr(); // Value-initialized: all fields start at zero.
+  Attr->size = sizeof(*Attr);
+  Attr->type = PERF_TYPE_HARDWARE;
+  Attr->config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
+  Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
+  Attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY;
+  Attr->sample_period = LbrSamplePeriod;
+  Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
+  Attr->disabled = 1;
+  Attr->exclude_kernel = 1;
+  Attr->exclude_hv = 1;
+  Attr->read_format = PERF_FORMAT_GROUP;
+  FullQualifiedEventString = "LBR";
+#endif // Without libpfm the event stays invalid: valid() returns false.
+}
+
+PerfEvent::PerfEvent(StringRef PfmEventString, unsigned SamplingPeriod)
+    : LbrSamplePeriod(SamplingPeriod), EventString(PfmEventString.str()),
+      Attr(nullptr) {
+  // "LBR" bypasses libpfm's event encoding and is configured by hand.
+  if (useLbr())
+    initPerfEventForLbr();
+  else
+    initPerfEvent();
+}
+
 StringRef PerfEvent::name() const { return EventString; }
 
 bool PerfEvent::valid() const { return !FullQualifiedEventString.empty(); }
 
+bool PerfEvent::useLbr() const { return EventString == "LBR"; }
+
 const perf_event_attr *PerfEvent::attribute() const { return Attr; }
 
 StringRef PerfEvent::getPfmEventString() const {
diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h
--- a/llvm/tools/llvm-exegesis/lib/Target.h
+++ b/llvm/tools/llvm-exegesis/lib/Target.h
@@ -27,6 +27,7 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Error.h"
 
 namespace llvm {
 namespace exegesis {
@@ -65,6 +66,11 @@
   explicit ExegesisTarget(ArrayRef<CpuAndPfmCounters> CpuPfmCounters)
       : CpuPfmCounters(CpuPfmCounters) {}
 
+  virtual Expected<pfm::Counter>
+  createCounter(const pfm::PerfEvent &Event) const {
+    return pfm::Counter(Event);
+  }
+
   // Targets can use this to add target-specific passes in assembleToStream();
   virtual void addTargetSpecificPasses(PassManagerBase &PM) const {}
 
@@ -173,6 +179,8 @@
       const LLVMState &State, InstructionBenchmark::ModeE Mode) const;
   std::unique_ptr<BenchmarkRunner> virtual createUopsBenchmarkRunner(
       const LLVMState &State) const;
+  std::unique_ptr<BenchmarkRunner> virtual createLbrLatencyBenchmarkRunner(
+      const LLVMState &State) const;
 
   const ExegesisTarget *Next = nullptr;
   const ArrayRef<CpuAndPfmCounters> CpuPfmCounters;
diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp
--- a/llvm/tools/llvm-exegesis/lib/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Target.cpp
@@ -49,6 +49,8 @@
   case InstructionBenchmark::Uops:
   case InstructionBenchmark::InverseThroughput:
     return createParallelSnippetGenerator(State, Opts);
+  case InstructionBenchmark::LbrLatency:
+    return createSerialSnippetGenerator(State, Opts);
   }
   return nullptr;
 }
@@ -77,6 +79,8 @@
       return make_error<Failure>("can't run 'uops' mode, sched model does not "
                                  "define uops or issue counters.");
     return createUopsBenchmarkRunner(State);
+  case InstructionBenchmark::LbrLatency:
+    return createLbrLatencyBenchmarkRunner(State);
   }
   return nullptr;
 }
@@ -101,6 +105,16 @@
   return std::make_unique<UopsBenchmarkRunner>(State);
 }
 
+std::unique_ptr<BenchmarkRunner>
+ExegesisTarget::createLbrLatencyBenchmarkRunner(const LLVMState &State) const {
+#if defined(__x86_64__) || defined(__x86__) || defined(__i386__)
+  return std::make_unique<LbrLatencyBenchmarkRunner>(State);
+#else
+  // LBR is x86-only: hand back no runner instead of warning on every build.
+  return nullptr;
+#endif
+}
+
 static_assert(std::is_pod<PfmCountersInfo>::value,
               "We shouldn't have dynamic initialization here");
 const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr,
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -18,7 +18,23 @@
 #include "X86Subtarget.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/MC/MCInstBuilder.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
+#include <memory>
+#include <vector>
+#ifdef HAVE_LIBPFM
+#include "perfmon/perf_event.h"
+#include "perfmon/pfmlib.h"
+#include "perfmon/pfmlib_perf_event.h"
+#endif // HAVE_LIBPFM
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 namespace llvm {
 namespace exegesis {
@@ -559,10 +575,203 @@
 #include "X86GenExegesis.inc"
 
 namespace {
+#ifdef HAVE_LIBPFM
+// A pfm::Counter that reads branch-stack (LBR) samples from the perf ring
+// buffer mapped from the event's file descriptor.
+class X86LbrCounter : public pfm::Counter {
+public:
+  explicit X86LbrCounter(const pfm::PerfEvent &Event)
+      : Counter(Event), PageSize(getpagesize()),
+        DataBufferSize(kBufferPages * PageSize) {
+    // First page is reserved for perf_event_mmap_page. Data buffer starts on
+    // the next page, so we allocate one more page.
+    void *Mapping = mmap(nullptr, PageSize + DataBufferSize,
+                         PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0);
+    // MMappedBuffer stays null on failure so that readOrError() reports it.
+    if (Mapping == MAP_FAILED)
+      llvm::errs() << "Failed to mmap buffer.";
+    else
+      MMappedBuffer = Mapping;
+  }
+
+  ~X86LbrCounter() override {
+    if (MMappedBuffer)
+      munmap(MMappedBuffer, PageSize + DataBufferSize);
+  }
+
+  void start() override {
+    ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
+  }
+
+  // Returns the value of readOrError(); logs the error and returns 0 instead
+  // when the counter cannot be read.
+  int64_t read() const override {
+    llvm::Expected<int64_t> ValueOrError = readOrError();
+    if (ValueOrError)
+      return *ValueOrError;
+    llvm::errs() << "Error reading counter: "
+                 << llvm::toString(ValueOrError.takeError()) << "\n";
+    return 0;
+  }
+
+  // Polls the LBR perf event until a sample carrying branch entries is found
+  // in the ring buffer, then returns the cycle count of its first entry.
+  llvm::Expected<int64_t> readOrError() const override {
+    if (!MMappedBuffer) {
+      return llvm::make_error<llvm::StringError>("Failed to mmap buffer.",
+                                                 llvm::errc::io_error);
+    }
+    // Filled by readDataBuffer() with the cycle counts found in the buffer.
+    std::vector<int64_t> CycleArray;
+    std::unique_ptr<char[]> DataBuf(new char[DataBufferSize]);
+    for (;;) {
+      // poll() returns -1 on error and 0 when it times out; retrying after a
+      // timeout could block forever if the snippet never produces a sample.
+      if (pollLbrPerfEvent() <= 0) {
+        return llvm::make_error<llvm::StringError>(
+            "Cannot poll LBR perf event.", llvm::errc::io_error);
+      }
+      // First page is reserved for perf_event_mmap_page. Data buffer starts on
+      // the next page.
+      struct perf_event_mmap_page Page;
+      memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));
+      const uint64_t DataTail = Page.data_tail;
+      const uint64_t DataHead = Page.data_head;
+      // We're supposed to use a barrier after reading data_head.
+      std::atomic_thread_fence(std::memory_order_acq_rel);
+      const size_t DataSize = DataHead - DataTail;
+      if (DataSize > DataBufferSize) {
+        return llvm::make_error<llvm::StringError>(
+            "DataSize larger than buffer size.", llvm::errc::invalid_argument);
+      }
+      copyDataBuffer(DataBuf.get(), DataTail, DataSize);
+      if (readDataBuffer(DataBuf.get(), DataSize, &CycleArray)) {
+        // TODO(vyng) Analyse the array and get proper value.
+        return CycleArray[0];
+      }
+    }
+  }
+
+private:
+  // Waits for the LBR perf events.
+  int pollLbrPerfEvent() const {
+    struct pollfd PollFd;
+    PollFd.fd = FileDescriptor;
+    PollFd.events = POLLIN;
+    PollFd.revents = 0;
+    return poll(&PollFd, 1 /* num of fds */, 10000 /* time out */);
+  }
+
+  // Copies the DataSize bytes that start at ring-buffer position Tail to Buf.
+  void copyDataBuffer(char *Buf, uint64_t Tail, size_t DataSize) const {
+    // First page is reserved for perf_event_mmap_page. Data buffer starts on
+    // the next page.
+    const char *Start = static_cast<const char *>(MMappedBuffer) + PageSize;
+    // The LBR buffer is a cyclic buffer, we copy data to another buffer.
+    const size_t Offset = Tail % DataBufferSize;
+    const size_t Room = DataBufferSize - Offset;
+    const size_t FirstChunk = DataSize < Room ? DataSize : Room;
+    memcpy(Buf, Start + Offset, FirstChunk);
+    // Whatever did not fit before the end wraps to the start of the buffer.
+    memcpy(Buf + FirstChunk, Start, DataSize - FirstChunk);
+  }
+
+  // Walks the records in DataBuf. For the first sample record that carries
+  // branch entries, appends the cycle count of each entry to CycleArray,
+  // patches the benchmarked loop so that it ends, and returns true. Returns
+  // false if the buffer has no such record or a record is malformed.
+  bool readDataBuffer(const char *DataBuf, size_t DataSize,
+                      std::vector<int64_t> *CycleArray) const {
+    size_t Pos = 0;
+    while (DataSize - Pos >= sizeof(struct perf_event_header)) {
+      struct perf_event_header Header;
+      memcpy(&Header, DataBuf + Pos, sizeof(Header));
+      // A bogus size would stall the walk or run past the copied data.
+      if (Header.size < sizeof(Header) || Header.size > DataSize - Pos)
+        return false;
+      size_t EntryPos = Pos + sizeof(Header);
+      Pos += Header.size;
+      // Ignores non-sample records and samples too short to hold a count.
+      if (Header.type != PERF_RECORD_SAMPLE ||
+          Pos - EntryPos < sizeof(uint64_t))
+        continue;
+      const uint64_t Count =
+          llvm::support::endian::read64(DataBuf + EntryPos, support::native);
+      EntryPos += sizeof(Count);
+      if (Count == 0 || Count > (Pos - EntryPos) / sizeof(perf_branch_entry))
+        continue;
+      // Read the perf_branch_entry array.
+      char *JumpPC = nullptr;
+      int64_t MinCycle = 0;
+      for (uint64_t I = 0; I < Count; ++I) {
+        struct perf_branch_entry Entry;
+        memcpy(&Entry, DataBuf + EntryPos, sizeof(Entry));
+        EntryPos += sizeof(Entry);
+        // We use the JumpPC from the entry with min cycle to avoid the entry
+        // that is returning from kernel.
+        if (I == 0 || MinCycle > Entry.cycles) {
+          MinCycle = Entry.cycles;
+          JumpPC = reinterpret_cast<char *>(Entry.from);
+        }
+        CycleArray->push_back(Entry.cycles);
+      }
+      patchBasicBlockToEndBenchmarkedLoop(JumpPC);
+      return true;
+    }
+    return false;
+  }
+
+  // Overwrites the jump at pc (the loop's jump back) so that the benchmarked
+  // loop ends with `pop %rbx; ret`.
+  void patchBasicBlockToEndBenchmarkedLoop(char *pc) const {
+    auto prot = PROT_READ | PROT_WRITE | PROT_EXEC;
+    auto page = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(pc) &
+                                         ~(PageSize - 1));
+    mprotect(page, PageSize, prot);
+    // Update the last jump back instruction, pc is the start of jump
+    // instruction.
+    if ((reinterpret_cast<uint64_t>(pc) & 0xf) != 0xf) {
+      // We have at least two bytes in the same cacheline, so we can use an
+      // atomic write to replace the jmp with pop, ret.
+      *reinterpret_cast<uint16_t *>(pc) = 0xc35b; // pop %rbx; ret;
+    } else {
+      // pc and pc+1 might be at different cacheline, so we first add the pop,
+      // ret instructions, and then update jump offset to jump to pop
+      // instruction.
+      *reinterpret_cast<uint16_t *>(pc + 5) = 0xc35b; // pop %rbx; ret;
+      // An atomic update on the displacement.
+      // Compare as unsigned: a plain (signed) char can never equal 0xeb.
+      if (static_cast<unsigned char>(pc[0]) == 0xeb /* x86 short jump */) {
+        // offset is 1 byte
+        pc[1] = 0x03;
+      } else { /* x86 near jump */
+        // offset is 4 bytes
+        *reinterpret_cast<uint32_t *>(pc + 1) = 0;
+      }
+    }
+  }
+
+  void *MMappedBuffer = nullptr; // Null when the constructor's mmap failed.
+  const size_t PageSize;
+  const size_t DataBufferSize; // Size of the data area, without the first page.
+  static constexpr int kBufferPages = 8;
+};
+#endif // HAVE_LIBPFM
+
 class ExegesisX86Target : public ExegesisTarget {
 public:
   ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
 
+  Expected<pfm::Counter>
+  createCounter(const pfm::PerfEvent &Event) const override {
+#ifdef HAVE_LIBPFM
+    // Can't use LBR without LIB PFM.
+    if (Event.useLbr())
+      return X86LbrCounter(Event); // NOTE(review): sliced to pfm::Counter.
+#endif
+    return pfm::Counter(Event);
+  }
+
 private:
   void addTargetSpecificPasses(PassManagerBase &PM) const override;
 
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -73,6 +73,8 @@
     "mode", cl::desc("the mode to run"), cl::cat(Options),
     cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency, "latency",
                           "Instruction Latency"),
+               clEnumValN(exegesis::InstructionBenchmark::LbrLatency,
+                          "lbr_latency", "Instruction Latency using LBR"),
                clEnumValN(exegesis::InstructionBenchmark::InverseThroughput,
                           "inverse_throughput",
                           "Instruction Inverse Throughput"),
@@ -83,6 +85,11 @@
                clEnumValN(exegesis::InstructionBenchmark::Unknown, "analysis",
                           "Analysis")));
 
+static cl::opt<unsigned> LbrSamplePeriod(
+    "lbr-sample-period",
+    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
+    cl::cat(BenchmarkOptions), cl::init(521)); // 521 chosen based on GWP
+
 static cl::opt<exegesis::InstructionBenchmark::RepetitionModeE> RepetitionMode(
     "repetition-mode", cl::desc("how to repeat the instruction snippet"),
     cl::cat(BenchmarkOptions),
@@ -332,6 +339,13 @@
     Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
   }
 
+#if !defined(__x86_64__) && !defined(__x86__) && !defined(__i386__)
+  if (exegesis::BenchmarkMode == exegesis::InstructionBenchmark::LbrLatency) {
+    ExitOnErr.setBanner("llvm-exegesis: ");
+    ExitWithError("LBR mode must be run on x86 arch.");
+  }
+#endif
+
   if (NumRepetitions == 0) {
     ExitOnErr.setBanner("llvm-exegesis: ");
     ExitWithError("--num-repetitions must be greater than zero");
@@ -342,8 +356,10 @@
     BenchmarkFile = "-";
 
   for (const BenchmarkCode &Conf : Configurations) {
-    InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration(
-        Conf, NumRepetitions, Repetitors, DumpObjectToDisk));
+    BenchmarkRunner::RunArg RunArg{Conf, NumRepetitions, Repetitors,
+                                   DumpObjectToDisk, LbrSamplePeriod};
+
+    InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration(RunArg));
     ExitOnFileError(BenchmarkFile, Result.writeYaml(State, BenchmarkFile));
   }
   exegesis::pfm::pfmTerminate();