diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -189,7 +189,8 @@ `latency` mode can be make use of either RDTSC or LBR. `latency[LBR]` is only available on X86 (at least `Skylake`). - To run in `latency` mode, a positive value must be specified for `x86-lbr-sample-period` and `--repetition-mode=loop`. + To run in `latency` mode, a positive value must be specified + for `x86-lbr-sample-period` and `--repetition-mode=loop`. In `analysis` mode, you also need to specify at least one of the `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`. @@ -202,23 +203,33 @@ On choosing the "right" sampling period, a small value is preferred, but throttling could occur if the sampling is too frequent. A prime number should be used to avoid consistently skipping certain blocks. - + .. option:: -repetition-mode=[duplicate|loop|min] Specify the repetition mode. `duplicate` will create a large, straight line - basic block with `num-repetitions` copies of the snippet. `loop` will wrap - the snippet in a loop which will be run `num-repetitions` times. The `loop` - mode tends to better hide the effects of the CPU frontend on architectures + basic block with `num-repetitions` instructions (repeating the snippet + `num-repetitions`/`snippet size` times). `loop` will, optionally, unroll the + snippet `loop-unroll-factor` times, and then wrap the result in a loop which + will execute `num-repetitions` instructions (thus, again, repeating the snippet + `num-repetitions`/`snippet size` times). The `loop` mode, especially with loop + unrolling, tends to better hide the effects of the CPU frontend on architectures that cache decoded instructions, but consumes a register for counting - iterations. 
If performing an analysis over many opcodes, it may be best - to instead use the `min` mode, which will run each other mode, and produce - the minimal measured result. + iterations. If performing an analysis over many opcodes, it may be best to + instead use the `min` mode, which will run each other mode, and produce the + minimal measured result. .. option:: -num-repetitions= Specify the number of repetitions of the asm snippet. Higher values lead to more accurate measurements but lengthen the benchmark. +.. option:: -loop-unroll-factor= + + Only effective for `-repetition-mode=[loop|min]`. + Instead of repeating the snippet directly, first unroll it this many times. + This allows better use of the CPU's op cache, which may have higher + throughput than the CPU's decoders. + .. option:: -max-configs-per-opcode= Specify the maximum configurations that can be generated for each opcode. diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -67,7 +67,8 @@ const MCInst &keyInstruction() const { return Key.Instructions[0]; } // The number of instructions inside the repeated snippet. For example, if a // snippet of 3 instructions is repeated 4 times, this is 12. - int NumRepetitions = 0; + unsigned NumRepetitions = 0; + unsigned LoopUnrollFactor = 0; enum RepetitionModeE { Duplicate, Loop, AggregateMin }; // Note that measurements are per instruction. 
std::vector Measurements; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h @@ -41,6 +41,7 @@ Expected runConfiguration(const BenchmarkCode &Configuration, unsigned NumRepetitions, + unsigned LoopUnrollFactor, ArrayRef> Repetitors, bool DumpObjectToDisk) const; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -133,7 +133,7 @@ } // namespace Expected BenchmarkRunner::runConfiguration( - const BenchmarkCode &BC, unsigned NumRepetitions, + const BenchmarkCode &BC, unsigned NumRepetitions, unsigned LoopUnrollFactor, ArrayRef> Repetitors, bool DumpObjectToDisk) const { InstructionBenchmark InstrBenchmark; @@ -142,6 +142,7 @@ InstrBenchmark.LLVMTriple = State.getTargetMachine().getTargetTriple().normalize(); InstrBenchmark.NumRepetitions = NumRepetitions; + InstrBenchmark.LoopUnrollFactor = LoopUnrollFactor; InstrBenchmark.Info = BC.Info; const std::vector &Instructions = BC.Key.Instructions; @@ -169,13 +170,15 @@ // the snippet for debug/analysis. This is so that the user clearly // understands that the inside instructions are repeated. constexpr const int kMinInstructionsForSnippet = 16; + constexpr const int kLoopUnrollFactorForSnippet = 2; { SmallString<0> Buffer; raw_svector_ostream OS(Buffer); if (Error E = assembleToStream( State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns, BC.Key.RegisterInitialValues, - Repetitor->Repeat(Instructions, kMinInstructionsForSnippet), + Repetitor->Repeat(Instructions, kMinInstructionsForSnippet, + kLoopUnrollFactorForSnippet), OS)) { return std::move(E); } @@ -188,7 +191,8 @@ // Assemble NumRepetitions instructions repetitions of the snippet for // measurements. 
const auto Filler = - Repetitor->Repeat(Instructions, InstrBenchmark.NumRepetitions); + Repetitor->Repeat(Instructions, InstrBenchmark.NumRepetitions, + InstrBenchmark.LoopUnrollFactor); object::OwningBinary ObjectFile; if (DumpObjectToDisk) { diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h @@ -39,7 +39,8 @@ // Returns a functor that repeats `Instructions` so that the function executes // at least `MinInstructions` instructions. virtual FillFunction Repeat(ArrayRef Instructions, - unsigned MinInstructions) const = 0; + unsigned MinInstructions, + unsigned LoopUnrollFactor) const = 0; explicit SnippetRepetitor(const LLVMState &State) : State(State) {} diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp @@ -11,6 +11,7 @@ #include "SnippetRepetitor.h" #include "Target.h" +#include "llvm/ADT/Sequence.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -24,8 +25,8 @@ // Repeats the snippet until there are at least MinInstructions in the // resulting code. - FillFunction Repeat(ArrayRef Instructions, - unsigned MinInstructions) const override { + FillFunction Repeat(ArrayRef Instructions, unsigned MinInstructions, + unsigned LoopUnrollFactor) const override { return [Instructions, MinInstructions](FunctionFiller &Filler) { auto Entry = Filler.getEntry(); if (!Instructions.empty()) { @@ -53,17 +54,19 @@ State.getTargetMachine().getTargetTriple())) {} // Loop over the snippet ceil(MinInstructions / Instructions.Size()) times. 
- FillFunction Repeat(ArrayRef Instructions, - unsigned MinInstructions) const override { - return [this, Instructions, MinInstructions](FunctionFiller &Filler) { + FillFunction Repeat(ArrayRef Instructions, unsigned MinInstructions, + unsigned LoopUnrollFactor) const override { + return [this, Instructions, MinInstructions, + LoopUnrollFactor](FunctionFiller &Filler) { const auto &ET = State.getExegesisTarget(); auto Entry = Filler.getEntry(); auto Loop = Filler.addBasicBlock(); auto Exit = Filler.addBasicBlock(); // Set loop counter to the right value: - const APInt LoopCount(32, (MinInstructions + Instructions.size() - 1) / - Instructions.size()); + const APInt LoopCount( + 32, + divideCeil(MinInstructions, LoopUnrollFactor * Instructions.size())); for (const MCInst &Inst : ET.setRegTo(State.getSubtargetInfo(), LoopCounter, LoopCount)) Entry.addInstruction(Inst); @@ -78,7 +81,10 @@ Loop.MBB->addLiveIn(Reg); for (const auto &LiveIn : Entry.MBB->liveins()) Loop.MBB->addLiveIn(LiveIn); - Loop.addInstructions(Instructions); + for (auto _ : seq(0U, LoopUnrollFactor)) { + (void)_; + Loop.addInstructions(Instructions); + } ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, State.getInstrInfo()); diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -116,6 +116,12 @@ cl::desc("number of time to repeat the asm snippet"), cl::cat(BenchmarkOptions), cl::init(10000)); +static cl::opt LoopUnrollFactor( + "loop-unroll-factor", + cl::desc("when repeating the instruction snippet by looping over it, " + "unroll the loop body this many times"), + cl::cat(BenchmarkOptions), cl::init(0)); + static cl::opt MaxConfigsPerOpcode( "max-configs-per-opcode", cl::desc( @@ -365,7 +371,8 @@ for (const BenchmarkCode &Conf : Configurations) { InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration( - Conf, NumRepetitions, Repetitors, 
DumpObjectToDisk)); + Conf, NumRepetitions, std::max(LoopUnrollFactor, 1U), + Repetitors, DumpObjectToDisk)); ExitOnFileError(BenchmarkFile, Result.writeYaml(State, BenchmarkFile)); } exegesis::pfm::pfmTerminate(); diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp --- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp @@ -42,11 +42,13 @@ const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State); const std::vector Instructions = {MCInstBuilder(X86::NOOP)}; FunctionFiller Sink(*MF, {X86::EAX}); - const auto Fill = Repetitor->Repeat(Instructions, kMinInstructions); + const auto Fill = + Repetitor->Repeat(Instructions, kMinInstructions, LoopUnrollFactor); Fill(Sink); } static constexpr const unsigned kMinInstructions = 3; + static constexpr const unsigned LoopUnrollFactor = 2; std::unique_ptr TM; std::unique_ptr Context; @@ -78,8 +80,8 @@ ASSERT_EQ(MF->getNumBlockIDs(), 3u); const auto &LoopBlock = *MF->getBlockNumbered(1); EXPECT_THAT(LoopBlock.instrs(), - ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8), - HasOpcode(X86::JCC_1))); + ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::NOOP), + HasOpcode(X86::ADD64ri8), HasOpcode(X86::JCC_1))); EXPECT_THAT(LoopBlock.liveins(), UnorderedElementsAre( LiveReg(X86::EAX),