diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -41,7 +41,9 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ThreadPool.h"
+#include <iterator>
+#include <optional>
 #include <string>
 
 namespace llvm {
@@ -242,6 +244,18 @@
              "and prints a message to access it (default = false)"),
     cl::cat(BenchmarkOptions), cl::init(false));
 
+static cl::opt<unsigned>
+    ThreadCount("j",
+                cl::desc("The number of threads to use for parallel "
+                         "operations (default = 0 (autodetect))"),
+                cl::cat(Options), cl::init(0));
+
+static cl::opt<unsigned> PerThreadBatchSize(
+    "thread-batch-size",
+    cl::desc("The batch size for parallel operations, as it is not efficient "
+             "to run one task per thread (default = 0 (autodetect))"),
+    cl::cat(Options), cl::init(0));
+
 static ExitOnError ExitOnErr("llvm-exegesis error: ");
 
 // Helper function that logs the error(s) and exits.
@@ -348,11 +362,151 @@
   return Benchmarks;
 }
 
+static size_t GetNumConfigurationsPerBatch(const ThreadPool &Pool,
+                                           unsigned NumRepetitors) {
+  // We default to the "thread-batch-size" option.
+  size_t N = PerThreadBatchSize;
+  if (N == 0) // autodetect - just use thread count, a good-enough default.
+    N = Pool.getThreadCount();
+
+  // The "thread-batch-size" option is specified per thread,
+  // so multiply by the actual thread count.
+  N = SaturatingMultiply(N, static_cast<size_t>(Pool.getThreadCount()));
+
+  // Also, each configuration runs once per repetitor,
+  // and we don't want the number of repetitors to affect
+  // the amount of work a single batch contains,
+  // so divide by the number of repetitors.
+  N = divideCeil(N, NumRepetitors);
+
+  assert(N > 0 && "Not processing anything!");
+  return N;
+}
+
+using ExpectedRunnableConfiguration =
+    std::optional<Expected<BenchmarkRunner::RunnableConfiguration>>;
+static constexpr int MaxRepetitors = 2;
+
+static void computeBatch(
+    std::optional<ProgressMeter<>> &Meter,
+    ArrayRef<BenchmarkCode> &Configurations, size_t NumConfigurationsPerBatch,
+    SmallVectorImpl<SmallVector<ExpectedRunnableConfiguration, MaxRepetitors>>
+        &PerConfigRCs,
+    ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
+    ThreadPool &Pool, const BenchmarkRunner &Runner) {
+  // On to the next batch.
+  PerConfigRCs.clear();
+
+  // In each iteration, we deal with a NumConfigurationsPerBatch-sized chunk.
+  ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr);
+  ArrayRef<BenchmarkCode> ConfigurationBatch =
+      Configurations.take_front(NumConfigurationsPerBatch);
+  Configurations = Configurations.drop_front(ConfigurationBatch.size());
+
+  // For each configuration in the batch:
+  PerConfigRCs.resize(ConfigurationBatch.size());
+  for (auto C : zip(ConfigurationBatch, PerConfigRCs)) {
+    const BenchmarkCode &BC = std::get<0>(C);
+    SmallVectorImpl<ExpectedRunnableConfiguration> &RCsOfConfiguration =
+        std::get<1>(C);
+
+    // For each configured repetitor:
+    RCsOfConfiguration.resize(Repetitors.size());
+    for (auto R : zip(Repetitors, RCsOfConfiguration)) {
+      const SnippetRepetitor &Repetitor = *std::get<0>(R);
+      // Prepare an output slot for the task, without invalidating iterators.
+      ExpectedRunnableConfiguration *Storage = &std::get<1>(R);
+      // Create an asynchronous task to generate the RunnableConfiguration
+      // for this configuration given this repetitor. This is thread-safe.
+      // NOTE: this does not run any measurements. This is codegen-only!
+      // NOTE: the task writes its output into preallocated storage,
+      //       which keeps the results in deterministic order.
+      Pool.async([BC, &Repetitor, &Runner, Storage]() {
+        *Storage = Runner.getRunnableConfiguration(BC, NumRepetitions,
+                                                   LoopBodySize, Repetitor);
+      });
+    }
+  }
+
+  // We've scheduled all codegen tasks for all configurations x repetitors.
+  // Now, wait until they *ALL* complete.
+  Pool.wait();
+}
+
+static void runOneConfiguration(
+    const LLVMState &State, raw_ostream &Ostr,
+    MutableArrayRef<ExpectedRunnableConfiguration> RCsOfConfiguration,
+    const BenchmarkRunner &Runner) {
+  // The codegen tasks for this configuration have completed.
+  // Now, run and measure it, once per repetitor.
+  SmallVector<InstructionBenchmark, MaxRepetitors> AllResults;
+  assert(RCsOfConfiguration.size() <= MaxRepetitors);
+  AllResults.reserve(RCsOfConfiguration.size());
+  // For each RunnableConfiguration (one per repetitor):
+  for (ExpectedRunnableConfiguration &&ERC :
+       make_range(std::make_move_iterator(RCsOfConfiguration.begin()),
+                  std::make_move_iterator(RCsOfConfiguration.end()))) {
+    assert(ERC && "Expected the task to have finished.");
+    auto RC = ExitOnErr(std::move(*ERC));
+    // Now, actually run the final generated snippet, and measure it!
+    // NOTE: this is done completely stand-alone, and not in the Pool!
+    AllResults.emplace_back(
+        ExitOnErr(Runner.runConfiguration(std::move(RC), DumpObjectToDisk)));
+  }
+  InstructionBenchmark &Result = AllResults.front();
+
+  // If any of our measurements failed, pretend they all have failed.
+  if (AllResults.size() > 1 &&
+      any_of(AllResults, [](const InstructionBenchmark &R) {
+        return R.Measurements.empty();
+      }))
+    Result.Measurements.clear();
+
+  if (RepetitionMode == InstructionBenchmark::RepetitionModeE::AggregateMin) {
+    for (const InstructionBenchmark &OtherResult :
+         ArrayRef(AllResults).drop_front()) {
+      llvm::append_range(Result.AssembledSnippet, OtherResult.AssembledSnippet);
+      // Aggregate measurements, but only iff all measurements succeeded.
+      if (Result.Measurements.empty())
+        continue;
+      assert(OtherResult.Measurements.size() == Result.Measurements.size() &&
+             "Expected to have identical number of measurements.");
+      for (auto I : zip(Result.Measurements, OtherResult.Measurements)) {
+        BenchmarkMeasure &Measurement = std::get<0>(I);
+        const BenchmarkMeasure &NewMeasurement = std::get<1>(I);
+        assert(Measurement.Key == NewMeasurement.Key &&
+               "Expected measurements to be symmetric");
+
+        Measurement.PerInstructionValue =
+            std::min(Measurement.PerInstructionValue,
+                     NewMeasurement.PerInstructionValue);
+        Measurement.PerSnippetValue = std::min(Measurement.PerSnippetValue,
+                                               NewMeasurement.PerSnippetValue);
+      }
+    }
+  }
+
+  // And output the results.
+  // NOTE: the output order is deterministic!
+  ExitOnFileError(BenchmarkFile, Result.writeYamlTo(State, Ostr));
+}
+
+static void runBatch(
+    const LLVMState &State, raw_ostream &Ostr,
+    MutableArrayRef<SmallVector<ExpectedRunnableConfiguration, MaxRepetitors>>
+        PerConfigRCs,
+    const BenchmarkRunner &Runner) {
+  // The codegen tasks have all completed! Now, for each configuration
+  // in this batch, run it and record the measurements.
+  for (MutableArrayRef<ExpectedRunnableConfiguration> RCsOfConfiguration :
+       PerConfigRCs)
+    runOneConfiguration(State, Ostr, RCsOfConfiguration, Runner);
+}
+
 static void runBenchmarkConfigurations(
     const LLVMState &State, ArrayRef<BenchmarkCode> Configurations,
     ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
     const BenchmarkRunner &Runner) {
   assert(!Configurations.empty() && "Don't have any configurations to run.");
+  assert(!Repetitors.empty() && Repetitors.size() <= MaxRepetitors &&
+         "Unexpected Repetitor count.");
   std::optional<raw_fd_ostream> FileOstr;
   if (BenchmarkFile != "-") {
     int ResultFD = 0;
@@ -364,55 +518,26 @@
   }
   raw_ostream &Ostr = FileOstr ? *FileOstr : outs();
 
+  ThreadPool Pool(hardware_concurrency(ThreadCount));
+
+  SmallVector<SmallVector<ExpectedRunnableConfiguration, MaxRepetitors>, 1>
+      PerConfigRCs;
+
+  size_t NumConfigurationsPerBatch =
+      GetNumConfigurationsPerBatch(Pool, Repetitors.size());
+  assert(NumConfigurationsPerBatch > 0 && "Not processing anything!");
+
+  PerConfigRCs.reserve(
+      std::min(NumConfigurationsPerBatch, Configurations.size()));
+
   std::optional<ProgressMeter<>> Meter;
   if (BenchmarkMeasurementsPrintProgress)
-    Meter.emplace(Configurations.size());
-  for (const BenchmarkCode &Conf : Configurations) {
-    ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr);
-    SmallVector<InstructionBenchmark, 2> AllResults;
-
-    for (const std::unique_ptr<const SnippetRepetitor> &Repetitor :
-         Repetitors) {
-      auto RC = ExitOnErr(Runner.getRunnableConfiguration(
-          Conf, NumRepetitions, LoopBodySize, *Repetitor));
-      AllResults.emplace_back(
-          ExitOnErr(Runner.runConfiguration(std::move(RC), DumpObjectToDisk)));
-    }
-    InstructionBenchmark &Result = AllResults.front();
-
-    // If any of our measurements failed, pretend they all have failed.
-    if (AllResults.size() > 1 &&
-        any_of(AllResults, [](const InstructionBenchmark &R) {
-          return R.Measurements.empty();
-        }))
-      Result.Measurements.clear();
-
-    if (RepetitionMode == InstructionBenchmark::RepetitionModeE::AggregateMin) {
-      for (const InstructionBenchmark &OtherResult :
-           ArrayRef(AllResults).drop_front()) {
-        llvm::append_range(Result.AssembledSnippet,
-                           OtherResult.AssembledSnippet);
-        // Aggregate measurements, but only iff all measurements succeeded.
-        if (Result.Measurements.empty())
-          continue;
-        assert(OtherResult.Measurements.size() == Result.Measurements.size() &&
-               "Expected to have identical number of measurements.");
-        for (auto I : zip(Result.Measurements, OtherResult.Measurements)) {
-          BenchmarkMeasure &Measurement = std::get<0>(I);
-          const BenchmarkMeasure &NewMeasurement = std::get<1>(I);
-          assert(Measurement.Key == NewMeasurement.Key &&
-                 "Expected measurements to be symmetric");
-
-          Measurement.PerInstructionValue =
-              std::min(Measurement.PerInstructionValue,
-                       NewMeasurement.PerInstructionValue);
-          Measurement.PerSnippetValue = std::min(
-              Measurement.PerSnippetValue, NewMeasurement.PerSnippetValue);
-        }
-      }
-    }
-
-    ExitOnFileError(BenchmarkFile, Result.writeYamlTo(State, Ostr));
+    Meter.emplace(
+        divideCeil(Configurations.size(), NumConfigurationsPerBatch));
+
+  // Outermost loop: run until we've processed all configurations.
+  while (!Configurations.empty()) {
+    computeBatch(Meter, Configurations, NumConfigurationsPerBatch,
+                 PerConfigRCs, Repetitors, Pool, Runner);
+    runBatch(State, Ostr, PerConfigRCs, Runner);
   }
 }
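
Usage sketch (not part of the patch): the -j and -thread-batch-size flags are the ones introduced above; -mode, -opcode-index and -benchmarks-file are pre-existing llvm-exegesis options, and the concrete values are only illustrative.

  llvm-exegesis -mode=latency -opcode-index=-1 \
      -benchmarks-file=latency.yaml -j=8 -thread-batch-size=16

With -j=8, snippet codegen for each batch is spread across 8 worker threads, while the measurements themselves still run serially on the main thread, so the YAML output order stays deterministic.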