diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -189,7 +189,8 @@
 
- `latency` mode can be  make use of either RDTSC or LBR.
+ `latency` mode can make use of either RDTSC or LBR.
  `latency[LBR]` is only available on X86 (at least `Skylake`).
- To run in `latency` mode, a positive value must be specified for `x86-lbr-sample-period` and `--repetition-mode=loop`.
+ To run in `latency` mode, a positive value must be specified
+ for `x86-lbr-sample-period` and `--repetition-mode=loop`.
 
  In `analysis` mode, you also need to specify at least one of the
  `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`.
@@ -202,23 +203,33 @@
   On choosing the "right" sampling period, a small value is preferred, but throttling
   could occur if the sampling is too frequent. A prime number should be used to
   avoid consistently skipping certain blocks.
-  
+
 .. option:: -repetition-mode=[duplicate|loop|min]
 
  Specify the repetition mode. `duplicate` will create a large, straight line
- basic block with `num-repetitions` copies of the snippet. `loop` will wrap
- the snippet in a loop which will be run `num-repetitions` times. The `loop`
- mode tends to better hide the effects of the CPU frontend on architectures
+ basic block with `num-repetitions` instructions (repeating the snippet
+ `num-repetitions`/`snippet size` times). `loop` will, optionally, unroll the
+ snippet `loop-unroll-factor` times, and then wrap the result in a loop which
+ will execute `num-repetitions` instructions (thus, again, repeating the snippet
+ `num-repetitions`/`snippet size` times). The `loop` mode, especially with loop
+ unrolling, tends to better hide the effects of the CPU frontend on architectures
  that cache decoded instructions, but consumes a register for counting
- iterations. If performing an analysis over many opcodes, it may be best
- to instead use the `min` mode, which will run each other mode, and produce
- the minimal measured result.
+ iterations. If performing an analysis over many opcodes, it may be best to
+ instead use the `min` mode, which will run each other mode, and produce the
+ minimal measured result.
 
 .. option:: -num-repetitions=<Number of repetitions>
 
  Specify the number of repetitions of the asm snippet.
  Higher values lead to more accurate measurements but lengthen the benchmark.
 
+.. option:: -loop-unroll-factor=<Number of times to unroll the loop>
+
+ Only effective for `-repetition-mode=[loop|min]`.
+ Instead of repeating the snippet directly, first unroll it this many times.
+ This makes better use of the CPU's Op Cache, which may have higher
+ throughput than the CPU's decoders.
+
 .. option:: -max-configs-per-opcode=<value>
 
  Specify the maximum configurations that can be generated for each opcode.
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -67,7 +67,8 @@
   const MCInst &keyInstruction() const { return Key.Instructions[0]; }
   // The number of instructions inside the repeated snippet. For example, if a
   // snippet of 3 instructions is repeated 4 times, this is 12.
-  int NumRepetitions = 0;
+  unsigned NumRepetitions = 0;
+  unsigned LoopUnrollFactor = 0;
   enum RepetitionModeE { Duplicate, Loop, AggregateMin };
   // Note that measurements are per instruction.
   std::vector<BenchmarkMeasure> Measurements;
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -41,6 +41,7 @@
 
   Expected<InstructionBenchmark>
   runConfiguration(const BenchmarkCode &Configuration, unsigned NumRepetitions,
+                   unsigned LoopUnrollFactor,
                    ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
                    bool DumpObjectToDisk) const;
 
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -133,7 +133,7 @@
 } // namespace
 
 Expected<InstructionBenchmark> BenchmarkRunner::runConfiguration(
-    const BenchmarkCode &BC, unsigned NumRepetitions,
+    const BenchmarkCode &BC, unsigned NumRepetitions, unsigned LoopUnrollFactor,
     ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
     bool DumpObjectToDisk) const {
   InstructionBenchmark InstrBenchmark;
@@ -142,6 +142,7 @@
   InstrBenchmark.LLVMTriple =
       State.getTargetMachine().getTargetTriple().normalize();
   InstrBenchmark.NumRepetitions = NumRepetitions;
+  InstrBenchmark.LoopUnrollFactor = LoopUnrollFactor;
   InstrBenchmark.Info = BC.Info;
 
   const std::vector<MCInst> &Instructions = BC.Key.Instructions;
@@ -169,13 +170,15 @@
     // the snippet for debug/analysis. This is so that the user clearly
     // understands that the inside instructions are repeated.
     constexpr const int kMinInstructionsForSnippet = 16;
+    constexpr const int kLoopUnrollFactorForSnippet = 2;
     {
       SmallString<0> Buffer;
       raw_svector_ostream OS(Buffer);
       if (Error E = assembleToStream(
               State.getExegesisTarget(), State.createTargetMachine(),
               BC.LiveIns, BC.Key.RegisterInitialValues,
-              Repetitor->Repeat(Instructions, kMinInstructionsForSnippet),
+              Repetitor->Repeat(Instructions, kMinInstructionsForSnippet,
+                                kLoopUnrollFactorForSnippet),
               OS)) {
         return std::move(E);
       }
@@ -188,7 +191,8 @@
     // Assemble NumRepetitions instructions repetitions of the snippet for
     // measurements.
     const auto Filler =
-        Repetitor->Repeat(Instructions, InstrBenchmark.NumRepetitions);
+        Repetitor->Repeat(Instructions, InstrBenchmark.NumRepetitions,
+                          InstrBenchmark.LoopUnrollFactor);
 
     object::OwningBinary<object::ObjectFile> ObjectFile;
     if (DumpObjectToDisk) {
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
@@ -39,7 +39,8 @@
   // Returns a functor that repeats `Instructions` so that the function executes
   // at least `MinInstructions` instructions.
   virtual FillFunction Repeat(ArrayRef<MCInst> Instructions,
-                              unsigned MinInstructions) const = 0;
+                              unsigned MinInstructions,
+                              unsigned LoopUnrollFactor) const = 0;
 
   explicit SnippetRepetitor(const LLVMState &State) : State(State) {}
 
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
@@ -11,6 +11,7 @@
 
 #include "SnippetRepetitor.h"
 #include "Target.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 
@@ -24,8 +25,8 @@
 
   // Repeats the snippet until there are at least MinInstructions in the
   // resulting code.
-  FillFunction Repeat(ArrayRef<MCInst> Instructions,
-                      unsigned MinInstructions) const override {
+  FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
+                      unsigned LoopUnrollFactor) const override {
     return [Instructions, MinInstructions](FunctionFiller &Filler) {
       auto Entry = Filler.getEntry();
       if (!Instructions.empty()) {
@@ -53,17 +54,19 @@
             State.getTargetMachine().getTargetTriple())) {}
 
-  // Loop over the snippet ceil(MinInstructions / Instructions.Size()) times.
+  // Loop ceil(MinInstructions/(LoopUnrollFactor * Instructions.size())) times.
-  FillFunction Repeat(ArrayRef<MCInst> Instructions,
-                      unsigned MinInstructions) const override {
-    return [this, Instructions, MinInstructions](FunctionFiller &Filler) {
+  FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
+                      unsigned LoopUnrollFactor) const override {
+    return [this, Instructions, MinInstructions,
+            LoopUnrollFactor](FunctionFiller &Filler) {
       const auto &ET = State.getExegesisTarget();
       auto Entry = Filler.getEntry();
       auto Loop = Filler.addBasicBlock();
       auto Exit = Filler.addBasicBlock();
 
       // Set loop counter to the right value:
-      const APInt LoopCount(32, (MinInstructions + Instructions.size() - 1) /
-                                    Instructions.size());
+      const APInt LoopCount(
+          32,
+          divideCeil(MinInstructions, LoopUnrollFactor * Instructions.size()));
       for (const MCInst &Inst :
            ET.setRegTo(State.getSubtargetInfo(), LoopCounter, LoopCount))
         Entry.addInstruction(Inst);
@@ -78,7 +81,10 @@
         Loop.MBB->addLiveIn(Reg);
       for (const auto &LiveIn : Entry.MBB->liveins())
         Loop.MBB->addLiveIn(LiveIn);
-      Loop.addInstructions(Instructions);
+      for (auto _ : seq(0U, LoopUnrollFactor)) {
+        (void)_;
+        Loop.addInstructions(Instructions);
+      }
       ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB,
                                      State.getInstrInfo());
 
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -116,6 +116,12 @@
                    cl::desc("number of time to repeat the asm snippet"),
                    cl::cat(BenchmarkOptions), cl::init(10000));
 
+static cl::opt<unsigned> LoopUnrollFactor(
+    "loop-unroll-factor",
+    cl::desc("when repeating the instruction snippet by looping over it, "
+             "unroll the loop body this many times"),
+    cl::cat(BenchmarkOptions), cl::init(0));
+
 static cl::opt<unsigned> MaxConfigsPerOpcode(
     "max-configs-per-opcode",
     cl::desc(
@@ -365,7 +371,8 @@
 
   for (const BenchmarkCode &Conf : Configurations) {
     InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration(
-        Conf, NumRepetitions, Repetitors, DumpObjectToDisk));
+        Conf, NumRepetitions, std::max<unsigned>(LoopUnrollFactor, 1U),
+        Repetitors, DumpObjectToDisk));
     ExitOnFileError(BenchmarkFile, Result.writeYaml(State, BenchmarkFile));
   }
   exegesis::pfm::pfmTerminate();
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
@@ -42,11 +42,13 @@
     const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State);
     const std::vector<MCInst> Instructions = {MCInstBuilder(X86::NOOP)};
     FunctionFiller Sink(*MF, {X86::EAX});
-    const auto Fill = Repetitor->Repeat(Instructions, kMinInstructions);
+    const auto Fill =
+        Repetitor->Repeat(Instructions, kMinInstructions, LoopUnrollFactor);
     Fill(Sink);
   }
 
   static constexpr const unsigned kMinInstructions = 3;
+  static constexpr const unsigned LoopUnrollFactor = 2;
 
   std::unique_ptr<LLVMTargetMachine> TM;
   std::unique_ptr<LLVMContext> Context;
@@ -78,8 +80,8 @@
   ASSERT_EQ(MF->getNumBlockIDs(), 3u);
   const auto &LoopBlock = *MF->getBlockNumbered(1);
   EXPECT_THAT(LoopBlock.instrs(),
-              ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8),
-                          HasOpcode(X86::JCC_1)));
+              ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
+                          HasOpcode(X86::ADD64ri8), HasOpcode(X86::JCC_1)));
   EXPECT_THAT(LoopBlock.liveins(),
               UnorderedElementsAre(
                   LiveReg(X86::EAX),