diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -41,7 +41,9 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ThreadPool.h"
+#include <iterator>
+#include <optional>
 #include <string>
 
 namespace llvm {
@@ -242,6 +244,18 @@
              "and prints a message to access it (default = false)"),
     cl::cat(BenchmarkOptions), cl::init(false));
 
+static cl::opt<unsigned>
+    ThreadCount("j",
+                cl::desc("The number of threads to use for parallel "
+                         "operations (default = 0 (autodetect))"),
+                cl::cat(Options), cl::init(0));
+
+static cl::opt<unsigned> PerThreadBatchSize(
+    "thread-batch-size",
+    cl::desc("The batch size for parallel operations, as it is not efficient "
+             "to run one task per thread (default = 0 (autodetect))"),
+    cl::cat(Options), cl::init(0));
+
 static ExitOnError ExitOnErr("llvm-exegesis error: ");
 
 // Helper function that logs the error(s) and exits.
@@ -348,11 +362,151 @@
   return Benchmarks;
 }
 
+static size_t GetNumConfigurationsPerBatch(const ThreadPool &Pool,
+                                           unsigned NumRepetitors) {
+  // We default to the "thread-batch-size" option.
+  size_t N = PerThreadBatchSize;
+  if (N == 0) // autodetect - just use thread count, a good-enough default.
+    N = Pool.getThreadCount();
+
+  // The "thread-batch-size" option is specified per thread,
+  // so multiply by the actual thread count.
+  N = SaturatingMultiply(N, static_cast<size_t>(Pool.getThreadCount()));
+
+  // Also, each configuration runs once per repetitor,
+  // and we don't want the number of repetitors to affect
+  // the amount of work a single batch contains,
+  // so divide by the number of repetitors.
+  N = divideCeil(N, NumRepetitors);
+
+  assert(N > 0 && "Not processing anything!");
+  return N;
+}
+
+using ExpectedRunnableConfiguration =
+    std::optional<Expected<BenchmarkRunner::RunnableConfiguration>>;
+static constexpr int MaxRepetitors = 2;
+
+static void computeBatch(
+    std::optional<ProgressMeter<>> &Meter,
+    ArrayRef<BenchmarkCode> &Configurations, size_t NumConfigurationsPerBatch,
+    SmallVectorImpl<SmallVector<ExpectedRunnableConfiguration, MaxRepetitors>>
+        &PerConfigRCs,
+    ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
+    ThreadPool &Pool, const BenchmarkRunner &Runner) {
+  // On to the next batch.
+  PerConfigRCs.clear();
+
+  // In each iteration, we deal with a NumConfigurationsPerBatch-sized chunk.
+  ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr);
+  ArrayRef<BenchmarkCode> ConfigurationBatch =
+      Configurations.take_front(NumConfigurationsPerBatch);
+  Configurations = Configurations.drop_front(ConfigurationBatch.size());
+
+  // For each configuration in the batch:
+  PerConfigRCs.resize(ConfigurationBatch.size());
+  for (auto C : zip(ConfigurationBatch, PerConfigRCs)) {
+    const BenchmarkCode &BC = std::get<0>(C);
+    SmallVectorImpl<ExpectedRunnableConfiguration> &RCsOfConfiguration =
+        std::get<1>(C);
+
+    // For each configured repetitor:
+    RCsOfConfiguration.resize(Repetitors.size());
+    for (auto R : zip(Repetitors, RCsOfConfiguration)) {
+      const SnippetRepetitor &Repetitor = *std::get<0>(R);
+      // Prepare an output slot for the task, without invalidating iterators.
+      ExpectedRunnableConfiguration *Storage = &std::get<1>(R);
+      // Create an asynchronous task to generate the RunnableConfiguration
+      // for this configuration given this repetitor. This is thread-safe.
+      // NOTE: this does not run any measurements. This is codegen-only!
+      // NOTE: the task writes its output into preallocated storage,
+      //       which keeps the results in deterministic order.
+      Pool.async([BC, &Repetitor, &Runner, Storage]() {
+        *Storage = Runner.getRunnableConfiguration(BC, NumRepetitions,
+                                                   LoopBodySize, Repetitor);
+      });
+    }
+  }
+
+  // We've scheduled all codegen tasks for all configurations x repetitors.
+  // Now, wait until they *ALL* complete.
+  Pool.wait();
+}
+
+static void runOneConfiguration(
+    const LLVMState &State, raw_ostream &Ostr,
+    MutableArrayRef<ExpectedRunnableConfiguration> RCsOfConfiguration,
+    const BenchmarkRunner &Runner) {
+  // The codegen tasks for this configuration have completed.
+  // Now, run and measure it, once per repetitor.
+  SmallVector<InstructionBenchmark, MaxRepetitors> AllResults;
+  assert(RCsOfConfiguration.size() <= MaxRepetitors);
+  AllResults.reserve(RCsOfConfiguration.size());
+  // For each RunnableConfiguration (one per repetitor):
+  for (ExpectedRunnableConfiguration &&ERC :
+       make_range(std::make_move_iterator(RCsOfConfiguration.begin()),
+                  std::make_move_iterator(RCsOfConfiguration.end()))) {
+    assert(ERC && "Expected the task to have finished.");
+    auto RC = ExitOnErr(std::move(*ERC));
+    // Now, actually run the final generated snippet, and measure it!
+    // NOTE: this is done completely stand-alone, and not in the Pool!
+    AllResults.emplace_back(
+        ExitOnErr(Runner.runConfiguration(std::move(RC), DumpObjectToDisk)));
+  }
+  InstructionBenchmark &Result = AllResults.front();
+
+  // If any of our measurements failed, pretend they all have failed.
+  if (AllResults.size() > 1 &&
+      any_of(AllResults, [](const InstructionBenchmark &R) {
+        return R.Measurements.empty();
+      }))
+    Result.Measurements.clear();
+
+  if (RepetitionMode == InstructionBenchmark::RepetitionModeE::AggregateMin) {
+    for (const InstructionBenchmark &OtherResult :
+         ArrayRef(AllResults).drop_front()) {
+      llvm::append_range(Result.AssembledSnippet, OtherResult.AssembledSnippet);
+      // Aggregate measurements, but only iff all measurements succeeded.
+      if (Result.Measurements.empty())
+        continue;
+      assert(OtherResult.Measurements.size() == Result.Measurements.size() &&
+             "Expected to have identical number of measurements.");
+      for (auto I : zip(Result.Measurements, OtherResult.Measurements)) {
+        BenchmarkMeasure &Measurement = std::get<0>(I);
+        const BenchmarkMeasure &NewMeasurement = std::get<1>(I);
+        assert(Measurement.Key == NewMeasurement.Key &&
+               "Expected measurements to be symmetric");
+
+        Measurement.PerInstructionValue =
+            std::min(Measurement.PerInstructionValue,
+                     NewMeasurement.PerInstructionValue);
+        Measurement.PerSnippetValue = std::min(Measurement.PerSnippetValue,
+                                               NewMeasurement.PerSnippetValue);
+      }
+    }
+  }
+
+  // And output the results.
+  // NOTE: the output order is deterministic!
+  ExitOnFileError(BenchmarkFile, Result.writeYamlTo(State, Ostr));
+}
+
+static void runBatch(
+    const LLVMState &State, raw_ostream &Ostr,
+    MutableArrayRef<SmallVector<ExpectedRunnableConfiguration, MaxRepetitors>>
+        PerConfigRCs,
+    const BenchmarkRunner &Runner) {
+  // The codegen tasks have all completed! Now, for each configuration
+  // in this batch, run it and record the measurements.
+  for (MutableArrayRef<ExpectedRunnableConfiguration> RCsOfConfiguration :
+       PerConfigRCs)
+    runOneConfiguration(State, Ostr, RCsOfConfiguration, Runner);
+}
+
 static void runBenchmarkConfigurations(
     const LLVMState &State, ArrayRef<BenchmarkCode> Configurations,
     ArrayRef<std::unique_ptr<const SnippetRepetitor>> Repetitors,
     const BenchmarkRunner &Runner) {
   assert(!Configurations.empty() && "Don't have any configurations to run.");
+  assert(!Repetitors.empty() && Repetitors.size() <= MaxRepetitors &&
+         "Unexpected Repetitor count.");
   std::optional<raw_fd_ostream> FileOstr;
   if (BenchmarkFile != "-") {
     int ResultFD = 0;
@@ -364,55 +518,26 @@
   }
   raw_ostream &Ostr = FileOstr ? *FileOstr : outs();
 
+  ThreadPool Pool(hardware_concurrency(ThreadCount));
+
+  SmallVector<SmallVector<ExpectedRunnableConfiguration, MaxRepetitors>, 1>
+      PerConfigRCs;
+
+  size_t NumConfigurationsPerBatch =
+      GetNumConfigurationsPerBatch(Pool, Repetitors.size());
+  assert(NumConfigurationsPerBatch > 0 && "Not processing anything!");
+
+  PerConfigRCs.reserve(
+      std::min(NumConfigurationsPerBatch, Configurations.size()));
+
   std::optional<ProgressMeter<>> Meter;
   if (BenchmarkMeasurementsPrintProgress)
-    Meter.emplace(Configurations.size());
-  for (const BenchmarkCode &Conf : Configurations) {
-    ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr);
-    SmallVector<InstructionBenchmark, 2> AllResults;
-
-    for (const std::unique_ptr<const SnippetRepetitor> &Repetitor :
-         Repetitors) {
-      auto RC = ExitOnErr(Runner.getRunnableConfiguration(
-          Conf, NumRepetitions, LoopBodySize, *Repetitor));
-      AllResults.emplace_back(
-          ExitOnErr(Runner.runConfiguration(std::move(RC), DumpObjectToDisk)));
-    }
-    InstructionBenchmark &Result = AllResults.front();
-
-    // If any of our measurements failed, pretend they all have failed.
-    if (AllResults.size() > 1 &&
-        any_of(AllResults, [](const InstructionBenchmark &R) {
-          return R.Measurements.empty();
-        }))
-      Result.Measurements.clear();
-
-    if (RepetitionMode == InstructionBenchmark::RepetitionModeE::AggregateMin) {
-      for (const InstructionBenchmark &OtherResult :
-           ArrayRef(AllResults).drop_front()) {
-        llvm::append_range(Result.AssembledSnippet,
-                           OtherResult.AssembledSnippet);
-        // Aggregate measurements, but only iff all measurements succeeded.
-        if (Result.Measurements.empty())
-          continue;
-        assert(OtherResult.Measurements.size() == Result.Measurements.size() &&
-               "Expected to have identical number of measurements.");
-        for (auto I : zip(Result.Measurements, OtherResult.Measurements)) {
-          BenchmarkMeasure &Measurement = std::get<0>(I);
-          const BenchmarkMeasure &NewMeasurement = std::get<1>(I);
-          assert(Measurement.Key == NewMeasurement.Key &&
-                 "Expected measurements to be symmetric");
-
-          Measurement.PerInstructionValue =
-              std::min(Measurement.PerInstructionValue,
-                       NewMeasurement.PerInstructionValue);
-          Measurement.PerSnippetValue = std::min(
-              Measurement.PerSnippetValue, NewMeasurement.PerSnippetValue);
-        }
-      }
-    }
-
-    ExitOnFileError(BenchmarkFile, Result.writeYamlTo(State, Ostr));
+    Meter.emplace(
+        divideCeil(Configurations.size(), NumConfigurationsPerBatch));
+
+  // Outermost loop: run until we've processed all configurations.
+  while (!Configurations.empty()) {
+    computeBatch(Meter, Configurations, NumConfigurationsPerBatch,
+                 PerConfigRCs, Repetitors, Pool, Runner);
+    runBatch(State, Ostr, PerConfigRCs, Runner);
   }
 }
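
Usage sketch (not part of the patch): the -j and -thread-batch-size flags are the ones introduced above; -mode, -opcode-index and -benchmarks-file are pre-existing llvm-exegesis options, and the concrete values are only illustrative.

  llvm-exegesis -mode=latency -opcode-index=-1 \
      -benchmarks-file=latency.yaml -j=8 -thread-batch-size=16

With -j=8, snippet codegen for each batch is spread across 8 worker threads, while the measurements themselves still run serially on the main thread, so the YAML output order stays deterministic.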