diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -189,7 +189,8 @@ `latency` mode can be make use of either RDTSC or LBR. `latency[LBR]` is only available on X86 (at least `Skylake`). - To run in `latency` mode, a positive value must be specified for `x86-lbr-sample-period` and `--repetition-mode=loop`. + To run in `latency` mode, a positive value must be specified + for `x86-lbr-sample-period` and `--repetition-mode=loop`. In `analysis` mode, you also need to specify at least one of the `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`. @@ -202,23 +203,35 @@ On choosing the "right" sampling period, a small value is preferred, but throttling could occur if the sampling is too frequent. A prime number should be used to avoid consistently skipping certain blocks. - + .. option:: -repetition-mode=[duplicate|loop|min] Specify the repetition mode. `duplicate` will create a large, straight line - basic block with `num-repetitions` copies of the snippet. `loop` will wrap - the snippet in a loop which will be run `num-repetitions` times. The `loop` - mode tends to better hide the effects of the CPU frontend on architectures + basic block with `num-repetitions` instructions (repeating the snippet + `num-repetitions`/`snippet size` times). `loop` will, optionally, duplicate the + snippet until the loop body contains at least `loop-body-size` + instructions, and then wrap the result in a loop which will execute + `num-repetitions` instructions (thus, again, repeating the snippet + `num-repetitions`/`snippet size` times). The `loop` mode, especially with loop + unrolling, tends to better hide the effects of the CPU frontend on architectures that cache decoded instructions, but consumes a register for counting - iterations. 
If performing an analysis over many opcodes, it may be best - to instead use the `min` mode, which will run each other mode, and produce - the minimal measured result. + iterations. If performing an analysis over many opcodes, it may be best to + instead use the `min` mode, which will run each other mode, and produce the + minimal measured result. .. option:: -num-repetitions= Specify the number of repetitions of the asm snippet. Higher values lead to more accurate measurements but lengthen the benchmark. +.. option:: -loop-body-size= + + Only effective for `-repetition-mode=[loop|min]`. + Instead of looping over the snippet directly, first duplicate it so that the + loop body contains at least this many instructions. This potentially results + in the loop body being cached in the CPU Op Cache / Loop Cache, + which may have higher throughput than the CPU decoders. + .. option:: -max-configs-per-opcode= Specify the maximum configurations that can be generated for each opcode. diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -67,7 +67,7 @@ const MCInst &keyInstruction() const { return Key.Instructions[0]; } // The number of instructions inside the repeated snippet. For example, if a // snippet of 3 instructions is repeated 4 times, this is 12. - int NumRepetitions = 0; + unsigned NumRepetitions = 0; enum RepetitionModeE { Duplicate, Loop, AggregateMin }; // Note that measurements are per instruction. 
std::vector Measurements; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h @@ -41,6 +41,7 @@ Expected runConfiguration(const BenchmarkCode &Configuration, unsigned NumRepetitions, + unsigned LoopUnrollFactor, ArrayRef> Repetitors, bool DumpObjectToDisk) const; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -133,7 +133,7 @@ } // namespace Expected BenchmarkRunner::runConfiguration( - const BenchmarkCode &BC, unsigned NumRepetitions, + const BenchmarkCode &BC, unsigned NumRepetitions, unsigned LoopBodySize, ArrayRef> Repetitors, bool DumpObjectToDisk) const { InstructionBenchmark InstrBenchmark; @@ -168,14 +168,16 @@ // Assemble at least kMinInstructionsForSnippet instructions by repeating // the snippet for debug/analysis. This is so that the user clearly // understands that the inside instructions are repeated. - constexpr const int kMinInstructionsForSnippet = 16; + const int kMinInstructionsForSnippet = 4 * Instructions.size(); + const int kLoopBodySizeForSnippet = 2 * Instructions.size(); { SmallString<0> Buffer; raw_svector_ostream OS(Buffer); if (Error E = assembleToStream( State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns, BC.Key.RegisterInitialValues, - Repetitor->Repeat(Instructions, kMinInstructionsForSnippet), + Repetitor->Repeat(Instructions, kMinInstructionsForSnippet, + kLoopBodySizeForSnippet), OS)) { return std::move(E); } @@ -187,8 +189,8 @@ // Assemble NumRepetitions instructions repetitions of the snippet for // measurements. 
- const auto Filler = - Repetitor->Repeat(Instructions, InstrBenchmark.NumRepetitions); + const auto Filler = Repetitor->Repeat( + Instructions, InstrBenchmark.NumRepetitions, LoopBodySize); object::OwningBinary ObjectFile; if (DumpObjectToDisk) { diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h @@ -39,7 +39,8 @@ // Returns a functor that repeats `Instructions` so that the function executes // at least `MinInstructions` instructions. virtual FillFunction Repeat(ArrayRef Instructions, - unsigned MinInstructions) const = 0; + unsigned MinInstructions, + unsigned LoopBodySize) const = 0; explicit SnippetRepetitor(const LLVMState &State) : State(State) {} diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp @@ -11,6 +11,7 @@ #include "SnippetRepetitor.h" #include "Target.h" +#include "llvm/ADT/Sequence.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -24,8 +25,8 @@ // Repeats the snippet until there are at least MinInstructions in the // resulting code. - FillFunction Repeat(ArrayRef Instructions, - unsigned MinInstructions) const override { + FillFunction Repeat(ArrayRef Instructions, unsigned MinInstructions, + unsigned LoopBodySize) const override { return [Instructions, MinInstructions](FunctionFiller &Filler) { auto Entry = Filler.getEntry(); if (!Instructions.empty()) { @@ -53,17 +54,26 @@ State.getTargetMachine().getTargetTriple())) {} // Loop over the snippet ceil(MinInstructions / Instructions.Size()) times. 
- FillFunction Repeat(ArrayRef Instructions, - unsigned MinInstructions) const override { - return [this, Instructions, MinInstructions](FunctionFiller &Filler) { + FillFunction Repeat(ArrayRef Instructions, unsigned MinInstructions, + unsigned LoopBodySize) const override { + return [this, Instructions, MinInstructions, + LoopBodySize](FunctionFiller &Filler) { const auto &ET = State.getExegesisTarget(); auto Entry = Filler.getEntry(); auto Loop = Filler.addBasicBlock(); auto Exit = Filler.addBasicBlock(); + const unsigned LoopUnrollFactor = + LoopBodySize <= Instructions.size() + ? 1 + : divideCeil(LoopBodySize, Instructions.size()); + assert(LoopUnrollFactor >= 1 && "Should end up with at least 1 snippet."); + // Set loop counter to the right value: - const APInt LoopCount(32, (MinInstructions + Instructions.size() - 1) / - Instructions.size()); + const APInt LoopCount( + 32, + divideCeil(MinInstructions, LoopUnrollFactor * Instructions.size())); + assert(LoopCount.uge(1) && "Trip count should be at least 1."); for (const MCInst &Inst : ET.setRegTo(State.getSubtargetInfo(), LoopCounter, LoopCount)) Entry.addInstruction(Inst); @@ -78,7 +88,10 @@ Loop.MBB->addLiveIn(Reg); for (const auto &LiveIn : Entry.MBB->liveins()) Loop.MBB->addLiveIn(LiveIn); - Loop.addInstructions(Instructions); + for (auto _ : seq(0U, LoopUnrollFactor)) { + (void)_; + Loop.addInstructions(Instructions); + } ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, State.getInstrInfo()); diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -116,6 +116,13 @@ cl::desc("number of time to repeat the asm snippet"), cl::cat(BenchmarkOptions), cl::init(10000)); +static cl::opt + LoopBodySize("loop-body-size", + cl::desc("when repeating the instruction snippet by looping " + "over it, duplicate the snippet until the loop body " + "contains at least this many 
instruction"), + cl::cat(BenchmarkOptions), cl::init(0)); + static cl::opt MaxConfigsPerOpcode( "max-configs-per-opcode", cl::desc( @@ -365,7 +372,7 @@ for (const BenchmarkCode &Conf : Configurations) { InstructionBenchmark Result = ExitOnErr(Runner->runConfiguration( - Conf, NumRepetitions, Repetitors, DumpObjectToDisk)); + Conf, NumRepetitions, LoopBodySize, Repetitors, DumpObjectToDisk)); ExitOnFileError(BenchmarkFile, Result.writeYaml(State, BenchmarkFile)); } exegesis::pfm::pfmTerminate(); diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp --- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp @@ -42,11 +42,13 @@ const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State); const std::vector Instructions = {MCInstBuilder(X86::NOOP)}; FunctionFiller Sink(*MF, {X86::EAX}); - const auto Fill = Repetitor->Repeat(Instructions, kMinInstructions); + const auto Fill = + Repetitor->Repeat(Instructions, kMinInstructions, kLoopBodySize); Fill(Sink); } static constexpr const unsigned kMinInstructions = 3; + static constexpr const unsigned kLoopBodySize = 5; std::unique_ptr TM; std::unique_ptr Context; @@ -78,7 +80,9 @@ ASSERT_EQ(MF->getNumBlockIDs(), 3u); const auto &LoopBlock = *MF->getBlockNumbered(1); EXPECT_THAT(LoopBlock.instrs(), - ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8), + ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::NOOP), + HasOpcode(X86::NOOP), HasOpcode(X86::NOOP), + HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8), HasOpcode(X86::JCC_1))); EXPECT_THAT(LoopBlock.liveins(), UnorderedElementsAre(