diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -41,7 +41,9 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ThreadPool.h"
 #include
+#include
 #include

 namespace llvm {
@@ -242,6 +244,18 @@
              "and prints a message to access it (default = false)"),
     cl::cat(BenchmarkOptions), cl::init(false));

+static cl::opt
+    ThreadCount("j",
+                cl::desc("The number of threads to use for parallel operations "
+                         "(default = 0 (autodetect))"),
+                cl::cat(Options), cl::init(0));
+
+static cl::opt PerThreadBatchSize(
+    "thread-batch-size",
+    cl::desc("The batch size for parallel operations as it is not efficient to "
+             "run one task per thread (default = 0 (autodetect))"),
+    cl::cat(Options), cl::init(0));
+
 static ExitOnError ExitOnErr("llvm-exegesis error: ");

 // Helper function that logs the error(s) and exits.
@@ -348,33 +362,97 @@
   return Benchmarks;
 }

-static void runBenchmarkConfigurations(
-    const LLVMState &State, ArrayRef Configurations,
+static size_t GetNumConfigurationsPerBatch(const ThreadPool &Pool,
+                                           unsigned NumRepetitors) {
+  // We default to the "thread-batch-size" option.
+  size_t N = PerThreadBatchSize;
+  if (N == 0) // autodetect - just use thread count, a good-enough default.
+    N = Pool.getThreadCount();
+
+  // "thread-batch-size" option is specified per-thread,
+  // so multiply by the actual thread count.
+  N = SaturatingMultiply(N, Pool.getThreadCount());
+
+  // Also, each configuration runs for each repetitor,
+  // and we don't want the number of repetitors to affect
+  // the amount of work a single batch contains,
+  // so just divide by the number of repetitors.
+  N = divideCeil(N, NumRepetitors);
+
+  assert(N > 0 && "Not processing anything!");
+  return N;
+}
+
+using ExpectedRunnableConfiguration =
+    std::optional>;
+static constexpr int MaxRepetitors = 2;
+
+static void computeBatch(
+    std::optional> &Meter,
+    ArrayRef &Configurations, size_t NumConfigurationsPerBatch,
+    SmallVectorImpl>
+        &PerConfigRCs,
     ArrayRef> Repetitors,
-    const BenchmarkRunner &Runner) {
-  assert(!Configurations.empty() && "Don't have any configurations to run.");
-  std::optional FileOstr;
-  if (BenchmarkFile != "-") {
-    int ResultFD = 0;
-    // Create output file or open existing file and truncate it, once.
-    ExitOnErr(errorCodeToError(openFileForWrite(BenchmarkFile, ResultFD,
-                                                sys::fs::CD_CreateAlways,
-                                                sys::fs::OF_TextWithCRLF)));
-    FileOstr.emplace(ResultFD, true /*shouldClose*/);
+    ThreadPool &Pool, const BenchmarkRunner &Runner) {
+  // Onto next batch.
+  PerConfigRCs.clear();
+
+  // In each iteration, we deal with NumConfigurationsPerBatch-sized chunks.
+  ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr);
+  ArrayRef ConfigurationBatch =
+      Configurations.take_front(NumConfigurationsPerBatch);
+  Configurations = Configurations.drop_front(ConfigurationBatch.size());
+
+  // For each configuration in batch:
+  PerConfigRCs.resize(ConfigurationBatch.size());
+  for (auto C : zip(ConfigurationBatch, PerConfigRCs)) {
+    const BenchmarkCode &BC = std::get<0>(C);
+    SmallVectorImpl &RCsOfConfiguration =
+        std::get<1>(C);
+
+    // For each configured repetitor:
+    RCsOfConfiguration.resize(Repetitors.size());
+    for (auto R : zip(Repetitors, RCsOfConfiguration)) {
+      const SnippetRepetitor &Repetitor = *std::get<0>(R);
+      ExpectedRunnableConfiguration *Storage = &std::get<1>(R);
+      // Prepare an output slot for the task, without invalidating iterators.
+      // Create asynchronous task to generate Runnable Configuration
+      // for this configuration given this repetitor. This is thread-safe.
+      // NOTE: this does not run any measurements.
This is codegen-only!
+      // NOTE: the task outputs into predetermined storage,
+      // which is in deterministic order.
+      Pool.async([BC, &Repetitor, &Runner, Storage]() {
+        *Storage = Runner.getRunnableConfiguration(BC, NumRepetitions,
+                                                   LoopBodySize, Repetitor);
+      });
+    }
   }
-  raw_ostream &Ostr = FileOstr ? *FileOstr : outs();
-  std::optional> Meter;
-  if (BenchmarkMeasurementsPrintProgress)
-    Meter.emplace(Configurations.size());
-  for (const BenchmarkCode &Conf : Configurations) {
-    ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr);
-    SmallVector AllResults;
+  // We've scheduled all codegen tasks for all configurations X repetitors.
+  // Now, let's wait until they *ALL* complete.
+  Pool.wait();
+}
-    for (const std::unique_ptr &Repetitor :
-         Repetitors) {
-      auto RC = ExitOnErr(Runner.getRunnableConfiguration(
-          Conf, NumRepetitions, LoopBodySize, *Repetitor));
+static void runBatch(
+    const LLVMState &State, raw_ostream &Ostr,
+    MutableArrayRef>
+        PerConfigRCs,
+    const BenchmarkRunner &Runner) {
+  // And they've completed! Now, for each configuration in this batch:
+  SmallVector AllResults;
+  for (MutableArrayRef RCsOfConfiguration :
+       PerConfigRCs) {
+    assert(RCsOfConfiguration.size() <= MaxRepetitors);
+    AllResults.clear();
+    AllResults.reserve(RCsOfConfiguration.size());
+    // For each Runnable Configuration per repetitor:
+    for (ExpectedRunnableConfiguration &&ERC :
+         make_range(std::make_move_iterator(RCsOfConfiguration.begin()),
+                    std::make_move_iterator(RCsOfConfiguration.end()))) {
+      assert(ERC);
+      auto RC = ExitOnErr(std::move(*ERC));
+      // Now, actually run the final generated snippet, and measure it!
+      // NOTE: this is being done completely stand-alone and not in Pool!
       AllResults.emplace_back(
           ExitOnErr(Runner.runConfiguration(std::move(RC), DumpObjectToDisk)));
     }
@@ -411,11 +489,53 @@
       }
     }
   }
-
+  // And output the results.
+  // NOTE: the order is deterministic!
ExitOnFileError(BenchmarkFile, Result.writeYamlTo(State, Ostr));
   }
 }

+static void runBenchmarkConfigurations(
+    const LLVMState &State, ArrayRef Configurations,
+    ArrayRef> Repetitors,
+    const BenchmarkRunner &Runner) {
+  assert(!Configurations.empty() && "Don't have any configurations to run.");
+  assert(!Repetitors.empty() && Repetitors.size() <= MaxRepetitors &&
+         "Unexpected Repetitor count.");
+  std::optional FileOstr;
+  if (BenchmarkFile != "-") {
+    int ResultFD = 0;
+    // Create output file or open existing file and truncate it, once.
+    ExitOnErr(errorCodeToError(openFileForWrite(BenchmarkFile, ResultFD,
+                                                sys::fs::CD_CreateAlways,
+                                                sys::fs::OF_TextWithCRLF)));
+    FileOstr.emplace(ResultFD, true /*shouldClose*/);
+  }
+  raw_ostream &Ostr = FileOstr ? *FileOstr : outs();
+
+  ThreadPool Pool(hardware_concurrency(ThreadCount));
+
+  SmallVector, 1>
+      PerConfigRCs;
+
+  size_t NumConfigurationsPerBatch =
+      GetNumConfigurationsPerBatch(Pool, Repetitors.size());
+  assert(NumConfigurationsPerBatch > 0 && "Not processing anything!");
+
+  PerConfigRCs.reserve(
+      std::min(NumConfigurationsPerBatch, Configurations.size()));
+
+  std::optional> Meter;
+  if (BenchmarkMeasurementsPrintProgress)
+    Meter.emplace(divideCeil(Configurations.size(), NumConfigurationsPerBatch));
+  // Outermost loop: run until we've processed all configurations.
+  while (!Configurations.empty()) {
+    computeBatch(Meter, Configurations, NumConfigurationsPerBatch, PerConfigRCs,
+                 Repetitors, Pool, Runner);
+    runBatch(State, Ostr, PerConfigRCs, Runner);
+  }
+}
+
 void benchmarkMain() {
   if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure) {
 #ifndef HAVE_LIBPFM