Index: docs/CommandGuide/llvm-profdata.rst
===================================================================
--- docs/CommandGuide/llvm-profdata.rst
+++ docs/CommandGuide/llvm-profdata.rst
@@ -106,6 +106,11 @@
  conjunction with -instr. Defaults to false, since it can inhibit compiler
  optimization during PGO.
 
+.. option:: -num-threads=N, -j=N
+
+ Use N threads to perform profile merging. When N=0, llvm-profdata auto-detects
+ an appropriate number of threads to use. This is the default.
+
 EXAMPLES
 ^^^^^^^^
 Basic Usage
Index: include/llvm/ProfileData/InstrProfWriter.h
===================================================================
--- include/llvm/ProfileData/InstrProfWriter.h
+++ include/llvm/ProfileData/InstrProfWriter.h
@@ -47,6 +47,8 @@
   /// for this function and the hash and number of counts match, each counter is
   /// summed. Optionally scale counts by \p Weight.
   Error addRecord(InstrProfRecord &&I, uint64_t Weight = 1);
+  /// Merge existing function counts from the given writer.
+  Error mergeRecordsFromWriter(InstrProfWriter &&IPW);
   /// Write the profile to \c OS
   void write(raw_fd_ostream &OS);
   /// Write the profile in text format to \c OS
Index: lib/ProfileData/InstrProfWriter.cpp
===================================================================
--- lib/ProfileData/InstrProfWriter.cpp
+++ lib/ProfileData/InstrProfWriter.cpp
@@ -182,6 +182,14 @@
   return Dest.takeError();
 }
 
+Error InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW) {
+  for (auto &I : IPW.FunctionData)
+    for (auto &Func : I.getValue())
+      if (Error E = addRecord(std::move(Func.second), 1))
+        return E;
+  return Error::success();
+}
+
 bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
   if (!Sparse)
     return true;
Index: test/tools/llvm-profdata/multiple-inputs.test
===================================================================
--- test/tools/llvm-profdata/multiple-inputs.test
+++ test/tools/llvm-profdata/multiple-inputs.test
@@ -51,3 +51,39 @@
 DISJOINT: Total functions: 2
 DISJOINT: Maximum function count: 1
 DISJOINT: Maximum internal block count: 3
+
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: -num-threads 2 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: -j 3 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
+FOO4: foo:
+FOO4: Counters: 3
+FOO4: Function count: 4
+FOO4: Block counts: [8, 12]
+FOO4: Total functions: 1
+FOO4: Maximum function count: 4
+FOO4: Maximum internal block count: 12
+
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: %p/Inputs/foo3-1.proftext -j 2 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: %p/Inputs/foo3-1.proftext -j 3 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN: %p/Inputs/foo3-1.proftext -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
+FOO5: foo:
+FOO5: Counters: 3
+FOO5: Function count: 5
+FOO5: Block counts: [10, 15]
+FOO5: Total functions: 1
+FOO5: Maximum function count: 5
+FOO5: Maximum internal block count: 15
Index: tools/llvm-profdata/llvm-profdata.cpp
===================================================================
--- tools/llvm-profdata/llvm-profdata.cpp
+++ tools/llvm-profdata/llvm-profdata.cpp
@@ -29,6 +29,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
+#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 
@@ -117,9 +118,68 @@
 };
 typedef SmallVector<WeightedFile, 5> WeightedFileVector;
 
+/// Keep track of merged data and reported errors.
+struct WriterContext {
+  std::mutex Lock;
+  InstrProfWriter Writer;
+  Error Err;
+  StringRef ErrWhence;
+  std::mutex &ErrLock;
+  SmallSet<instrprof_error, 4> &WriterErrorCodes;
+
+  WriterContext(bool IsSparse, std::mutex &ErrLock,
+                SmallSet<instrprof_error, 4> &WriterErrorCodes)
+      : Lock(), Writer(IsSparse), Err(Error::success()), ErrWhence(""),
+        ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {}
+};
+
+/// Load an input into a writer context.
+static void loadInput(const WeightedFile &Input, WriterContext *WC) {
+  std::unique_lock<std::mutex> CtxLock{WC->Lock};
+
+  // If there's a pending hard error, don't do more work.
+  if (WC->Err)
+    return;
+
+  WC->ErrWhence = Input.Filename;
+
+  auto ReaderOrErr = InstrProfReader::create(Input.Filename);
+  if ((WC->Err = ReaderOrErr.takeError()))
+    return;
+
+  auto Reader = std::move(ReaderOrErr.get());
+  bool IsIRProfile = Reader->isIRLevelProfile();
+  if (WC->Writer.setIsIRLevelProfile(IsIRProfile)) {
+    WC->Err = make_error<StringError>(
+        "Merge IR generated profile with Clang generated profile.",
+        std::error_code());
+    return;
+  }
+
+  for (auto &I : *Reader) {
+    if (Error E = WC->Writer.addRecord(std::move(I), Input.Weight)) {
+      // Only show hint the first time an error occurs.
+      instrprof_error IPE = InstrProfError::take(std::move(E));
+      std::unique_lock<std::mutex> ErrorGuard{WC->ErrLock};
+      bool firstTime = WC->WriterErrorCodes.insert(IPE).second;
+      handleMergeWriterError(make_error<InstrProfError>(IPE), Input.Filename,
+                             I.Name, firstTime);
+    }
+  }
+  if (Reader->hasError())
+    WC->Err = Reader->getError();
+}
+
+/// Merge two writer contexts together.
+static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) {
+  if (Error E = Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer)))
+    Dst->Err = std::move(E);
+}
+
 static void mergeInstrProfile(const WeightedFileVector &Inputs,
                               StringRef OutputFilename,
-                              ProfileFormat OutputFormat, bool OutputSparse) {
+                              ProfileFormat OutputFormat, bool OutputSparse,
+                              unsigned NumThreads) {
   if (OutputFilename.compare("-") == 0)
     exitWithError("Cannot write indexed profdata format to stdout.");
 
@@ -131,30 +191,61 @@
   if (EC)
     exitWithErrorCode(EC, OutputFilename);
 
-  InstrProfWriter Writer(OutputSparse);
+  std::mutex ErrorLock;
   SmallSet<instrprof_error, 4> WriterErrorCodes;
-  for (const auto &Input : Inputs) {
-    auto ReaderOrErr = InstrProfReader::create(Input.Filename);
-    if (Error E = ReaderOrErr.takeError())
-      exitWithError(std::move(E), Input.Filename);
-
-    auto Reader = std::move(ReaderOrErr.get());
-    bool IsIRProfile = Reader->isIRLevelProfile();
-    if (Writer.setIsIRLevelProfile(IsIRProfile))
-      exitWithError("Merge IR generated profile with Clang generated profile.");
-
-    for (auto &I : *Reader) {
-      if (Error E = Writer.addRecord(std::move(I), Input.Weight)) {
-        // Only show hint the first time an error occurs.
-        instrprof_error IPE = InstrProfError::take(std::move(E));
-        bool firstTime = WriterErrorCodes.insert(IPE).second;
-        handleMergeWriterError(make_error<InstrProfError>(IPE), Input.Filename,
-                               I.Name, firstTime);
-      }
+
+  // If NumThreads is not specified, auto-detect a good default.
+  if (NumThreads == 0)
+    NumThreads = std::max(1U, std::min(std::thread::hardware_concurrency(),
+                                       unsigned(Inputs.size() / 2)));
+
+  // Initialize the writer contexts.
+  SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
+  for (unsigned I = 0; I < NumThreads; ++I)
+    Contexts.emplace_back(llvm::make_unique<WriterContext>(
+        OutputSparse, ErrorLock, WriterErrorCodes));
+
+  if (NumThreads == 1) {
+    for (const auto &Input : Inputs)
+      loadInput(Input, Contexts[0].get());
+  } else {
+    ThreadPool Pool(NumThreads);
+
+    // Load the inputs in parallel (N/NumThreads serial steps).
+    unsigned Ctx = 0;
+    for (const auto &Input : Inputs) {
+      Pool.async(loadInput, Input, Contexts[Ctx].get());
+      Ctx = (Ctx + 1) % NumThreads;
     }
-    if (Reader->hasError())
-      exitWithError(Reader->getError(), Input.Filename);
+    Pool.wait();
+
+    // Merge the writer contexts together (lg(NumThreads) serial steps).
+    unsigned Mid = Contexts.size() / 2;
+    unsigned End = Contexts.size();
+    assert(Mid > 0 && "Expected more than one context");
+    do {
+      for (unsigned I = 0; I < Mid; ++I)
+        Pool.async(mergeWriterContexts, Contexts[I].get(),
+                   Contexts[I + Mid].get());
+      Pool.wait();
+      // The leftover odd context is also folded into Contexts[0], which the
+      // I == 0 task above writes to; finish that round first to avoid a race.
+      if (End & 1) {
+        Pool.async(mergeWriterContexts, Contexts[0].get(),
+                   Contexts[End - 1].get());
+        Pool.wait();
+      }
+      End = Mid;
+      Mid /= 2;
+    } while (Mid > 0);
   }
+
+  // Handle deferred hard errors encountered during merging.
+  for (std::unique_ptr<WriterContext> &WC : Contexts)
+    if (WC->Err)
+      exitWithError(std::move(WC->Err), WC->ErrWhence);
+
+  InstrProfWriter &Writer = Contexts[0]->Writer;
   if (OutputFormat == PF_Text)
     Writer.writeText(Output);
   else
@@ -288,6 +379,11 @@
                                  clEnumValEnd));
   cl::opt<bool> OutputSparse("sparse", cl::init(false),
       cl::desc("Generate a sparse profile (only meaningful for -instr)"));
+  cl::opt<unsigned> NumThreads(
+      "num-threads", cl::init(0),
+      cl::desc("Number of merge threads to use (default: autodetect)"));
+  cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"),
+                        cl::aliasopt(NumThreads));
 
   cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
 
@@ -314,7 +410,7 @@
 
   if (ProfileKind == instr)
     mergeInstrProfile(WeightedInputs, OutputFilename, OutputFormat,
-                      OutputSparse);
+                      OutputSparse, NumThreads);
   else
     mergeSampleProfile(WeightedInputs, OutputFilename, OutputFormat);
 
Index: unittests/ProfileData/InstrProfTest.cpp
===================================================================
--- unittests/ProfileData/InstrProfTest.cpp
+++ unittests/ProfileData/InstrProfTest.cpp
@@ -204,6 +204,31 @@
   delete PSFromMD;
 }
 
+TEST_F(InstrProfTest, test_writer_merge) {
+  InstrProfRecord Record1("func1", 0x1234, {42});
+  NoError(Writer.addRecord(std::move(Record1)));
+
+  InstrProfWriter Writer2;
+  InstrProfRecord Record2("func2", 0x1234, {0, 0});
+  NoError(Writer2.addRecord(std::move(Record2)));
+
+  NoError(Writer.mergeRecordsFromWriter(std::move(Writer2)));
+
+  auto Profile = Writer.writeBuffer();
+  readProfile(std::move(Profile));
+
+  Expected<InstrProfRecord> R = Reader->getInstrProfRecord("func1", 0x1234);
+  ASSERT_TRUE(NoError(R.takeError()));
+  ASSERT_EQ(1U, R->Counts.size());
+  ASSERT_EQ(42U, R->Counts[0]);
+
+  R = Reader->getInstrProfRecord("func2", 0x1234);
+  ASSERT_TRUE(NoError(R.takeError()));
+  ASSERT_EQ(2U, R->Counts.size());
+  ASSERT_EQ(0U, R->Counts[0]);
+  ASSERT_EQ(0U, R->Counts[1]);
+}
+
 static const char callee1[] = "callee1";
 static const char callee2[] = "callee2";
 static const char callee3[] = "callee3";