diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -41,7 +41,9 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ThreadPool.h"
 #include
+#include
 #include

 namespace llvm {
@@ -242,6 +244,18 @@
              "and prints a message to access it (default = false)"),
     cl::cat(BenchmarkOptions), cl::init(false));

+static cl::opt
+    ThreadCount("j",
+                cl::desc("The number of threads to use for parallel operations "
+                         "(default = 0 (autodetect))"),
+                cl::cat(Options), cl::init(0));
+
+static cl::opt PerThreadBatchSize(
+    "thread-batch-size",
+    cl::desc("The batch size for parallel operations as it is not efficient to "
+             "run one task per thread (default = 0 (autodetect))"),
+    cl::cat(Options), cl::init(0));
+
 static ExitOnError ExitOnErr("llvm-exegesis error: ");

 // Helper function that logs the error(s) and exits.
@@ -348,33 +362,97 @@
   return Benchmarks;
 }

-static void runBenchmarkConfigurations(
-    const LLVMState &State, ArrayRef Configurations,
+static size_t GetNumConfigurationsPerBatch(const ThreadPool &Pool,
+                                           unsigned NumRepetitors) {
+  // We default to the "thread-batch-size" option.
+  size_t N = PerThreadBatchSize;
+  if (N == 0) // autodetect - just use thread count, a good-enough default.
+    N = Pool.getThreadCount();
+
+  // "thread-batch-size" option is specified per-thread,
+  // so multiply by the actual thread count.
+  N = SaturatingMultiply(N, Pool.getThreadCount());
+
+  // Also, each configuration runs for each repetitor,
+  // and we don't want the number of repetitors to affect
+  // the amount of work a single batch contains,
+  // so just divide by the number of repetitors.
+  N = divideCeil(N, NumRepetitors);
+
+  assert(N > 0 && "Not processing anything!");
+  return N;
+}
+
+using ExpectedRunnableConfiguration =
+    std::optional>;
+static constexpr int MaxRepetitors = 2;
+
+static void computeBatch(
+    std::optional> &Meter,
+    ArrayRef &Configurations, size_t NumConfigurationsPerBatch,
+    SmallVectorImpl>
+        &PerConfigRCs,
     ArrayRef> Repetitors,
-    const BenchmarkRunner &Runner) {
-  assert(!Configurations.empty() && "Don't have any configurations to run.");
-  std::optional FileOstr;
-  if (BenchmarkFile != "-") {
-    int ResultFD = 0;
-    // Create output file or open existing file and truncate it, once.
-    ExitOnErr(errorCodeToError(openFileForWrite(BenchmarkFile, ResultFD,
-                                                sys::fs::CD_CreateAlways,
-                                                sys::fs::OF_TextWithCRLF)));
-    FileOstr.emplace(ResultFD, true /*shouldClose*/);
+    ThreadPool &Pool, const BenchmarkRunner &Runner) {
+  // Onto next batch.
+  PerConfigRCs.clear();
+
+  // In each iteration, we deal with NumConfigurationsPerBatch-sized chunks.
+  ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr);
+  ArrayRef ConfigurationBatch =
+      Configurations.take_front(NumConfigurationsPerBatch);
+  Configurations = Configurations.drop_front(ConfigurationBatch.size());
+
+  // For each configuration in batch:
+  PerConfigRCs.resize(ConfigurationBatch.size());
+  for (auto C : zip(ConfigurationBatch, PerConfigRCs)) {
+    const BenchmarkCode &BC = std::get<0>(C);
+    SmallVectorImpl &RCsOfConfiguration =
+        std::get<1>(C);
+
+    // For each configured repetitor:
+    RCsOfConfiguration.resize(Repetitors.size());
+    for (auto R : zip(Repetitors, RCsOfConfiguration)) {
+      const SnippetRepetitor &Repetitor = *std::get<0>(R);
+      ExpectedRunnableConfiguration *Storage = &std::get<1>(R);
+      // Prepare an output slot for the task, without invalidating iterators.
+      // Create asynchronous task to generate Runnable Configuration
+      // for this configuration given this repetitor. This is thread-safe.
+      // NOTE: this does not run any measurements.
This is codegen-only!
+      // NOTE: the task outputs into predetermined storage,
+      // which is in deterministic order.
+      Pool.async([BC, &Repetitor, &Runner, Storage]() {
+        *Storage = Runner.getRunnableConfiguration(BC, NumRepetitions,
+                                                   LoopBodySize, Repetitor);
+      });
+    }
   }
-  raw_ostream &Ostr = FileOstr ? *FileOstr : outs();
-  std::optional> Meter;
-  if (BenchmarkMeasurementsPrintProgress)
-    Meter.emplace(Configurations.size());
-  for (const BenchmarkCode &Conf : Configurations) {
-    ProgressMeter<>::ProgressMeterStep MeterStep(Meter ? &*Meter : nullptr);
-    SmallVector AllResults;
+  // We've scheduled all codegen tasks for all configurations X repetitors.
+  // Now, let's wait until they *ALL* complete.
+  Pool.wait();
+}
-    for (const std::unique_ptr &Repetitor :
-         Repetitors) {
-      auto RC = ExitOnErr(Runner.getRunnableConfiguration(
-          Conf, NumRepetitions, LoopBodySize, *Repetitor));
+static void runBatch(
+    const LLVMState &State, raw_ostream &Ostr,
+    MutableArrayRef>
+        PerConfigRCs,
+    const BenchmarkRunner &Runner) {
+  // And they've completed! Now, for each configuration in this batch:
+  SmallVector AllResults;
+  for (MutableArrayRef RCsOfConfiguration :
+       PerConfigRCs) {
+    assert(RCsOfConfiguration.size() <= MaxRepetitors);
+    AllResults.clear();
+    AllResults.reserve(RCsOfConfiguration.size());
+    // For each Runnable Configuration per repetitor:
+    for (ExpectedRunnableConfiguration &&ERC :
+         make_range(std::make_move_iterator(RCsOfConfiguration.begin()),
+                    std::make_move_iterator(RCsOfConfiguration.end()))) {
+      assert(ERC);
+      auto RC = ExitOnErr(std::move(*ERC));
+      // Now, actually run the final generated snippet, and measure it!
+      // NOTE: this is being done completely stand-alone and not in Pool!
       AllResults.emplace_back(
           ExitOnErr(Runner.runConfiguration(std::move(RC), DumpObjectToDisk)));
     }
@@ -411,11 +489,53 @@
       }
     }
   }
-
+  // And output the results.
+  // NOTE: the order is deterministic!
ExitOnFileError(BenchmarkFile, Result.writeYamlTo(State, Ostr));
   }
 }

+static void runBenchmarkConfigurations(
+    const LLVMState &State, ArrayRef Configurations,
+    ArrayRef> Repetitors,
+    const BenchmarkRunner &Runner) {
+  assert(!Configurations.empty() && "Don't have any configurations to run.");
+  assert(!Repetitors.empty() && Repetitors.size() <= MaxRepetitors &&
+         "Unexpected Repetitor count.");
+  std::optional FileOstr;
+  if (BenchmarkFile != "-") {
+    int ResultFD = 0;
+    // Create output file or open existing file and truncate it, once.
+    ExitOnErr(errorCodeToError(openFileForWrite(BenchmarkFile, ResultFD,
+                                                sys::fs::CD_CreateAlways,
+                                                sys::fs::OF_TextWithCRLF)));
+    FileOstr.emplace(ResultFD, true /*shouldClose*/);
+  }
+  raw_ostream &Ostr = FileOstr ? *FileOstr : outs();
+
+  ThreadPool Pool(hardware_concurrency(ThreadCount));
+
+  SmallVector, 1>
+      PerConfigRCs;
+
+  size_t NumConfigurationsPerBatch =
+      GetNumConfigurationsPerBatch(Pool, Repetitors.size());
+  assert(NumConfigurationsPerBatch > 0 && "Not processing anything!");
+
+  PerConfigRCs.reserve(
+      std::min(NumConfigurationsPerBatch, Configurations.size()));
+
+  std::optional> Meter;
+  if (BenchmarkMeasurementsPrintProgress)
+    Meter.emplace(divideCeil(Configurations.size(), NumConfigurationsPerBatch));
+  // Outermost loop: run until we've processed all configurations.
+  while (!Configurations.empty()) {
+    computeBatch(Meter, Configurations, NumConfigurationsPerBatch, PerConfigRCs,
+                 Repetitors, Pool, Runner);
+    runBatch(State, Ostr, PerConfigRCs, Runner);
+  }
+}
+
 void benchmarkMain() {
   if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure) {
 #ifndef HAVE_LIBPFM