diff --git a/llvm/test/tools/llvm-reduce/operands-skip.ll b/llvm/test/tools/llvm-reduce/operands-skip.ll
--- a/llvm/test/tools/llvm-reduce/operands-skip.ll
+++ b/llvm/test/tools/llvm-reduce/operands-skip.ll
@@ -1,6 +1,13 @@
 ; RUN: llvm-reduce %s -o %t --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
 ; RUN: FileCheck %s --input-file %t --check-prefixes=REDUCED
 
+; RUN: llvm-reduce -max-chunk-threads=2 %s -o %t.1 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
+; RUN: FileCheck %s --input-file %t.1 --check-prefixes=REDUCED
+
+; RUN: llvm-reduce -max-chunk-threads=4 %s -o %t.2 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
+; RUN: FileCheck %s --input-file %t.2 --check-prefixes=REDUCED
+
+
 ; INTERESTING: store i32 43, i32* {{(%imm|%indirect)}}, align 4
 ; REDUCED: store i32 43, i32* %imm, align 4
 
diff --git a/llvm/tools/llvm-reduce/deltas/Delta.cpp b/llvm/tools/llvm-reduce/deltas/Delta.cpp
--- a/llvm/tools/llvm-reduce/deltas/Delta.cpp
+++ b/llvm/tools/llvm-reduce/deltas/Delta.cpp
@@ -15,8 +15,11 @@
 #include "Delta.h"
 #include "ReducerWorkItem.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <fstream>
 #include <set>
@@ -31,6 +34,12 @@
     "starting-granularity-level",
     cl::desc("Number of times to divide chunks prior to first test"));
 
+static cl::opt<unsigned> MaxChunkThreads(
+    "max-chunk-threads",
+    cl::desc("Maximum number of threads to use to process chunks. Set to 1 to "
+             "disable parallelism. Maximum capped to 32."),
+    cl::init(1));
+
 void writeOutput(ReducerWorkItem &M, llvm::StringRef Message);
 
 bool isReduced(ReducerWorkItem &M, TestRunner &Test,
@@ -135,6 +144,12 @@
     increaseGranularity(ChunksStillConsideredInteresting);
   }
 
+  MaxChunkThreads = std::min(unsigned(MaxChunkThreads), 32u);
+  ThreadPool ChunkThreadPool(hardware_concurrency(MaxChunkThreads));
+
+  unsigned NumTasks = MaxChunkThreads * 2;
+  SmallVector<SmallString<0>> Results(MaxChunkThreads * 2, {});
+
   bool FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity;
   do {
     FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = false;
@@ -145,7 +160,8 @@
     // modified module if the chunk resulted in a reduction.
     auto CheckChunk = [&UninterestingChunks, &Test, &ExtractChunksFromModule,
                        &ChunksStillConsideredInteresting](
-                          Chunk &ChunkToCheckForUninterestingness)
+                          Chunk &ChunkToCheckForUninterestingness,
+                          std::unique_ptr<ReducerWorkItem> Clone)
         -> std::unique_ptr<ReducerWorkItem> {
       // Take all of ChunksStillConsideredInteresting chunks, except those we've
       // already deemed uninteresting (UninterestingChunks) but didn't remove
@@ -160,9 +176,6 @@
                      C != ChunkToCheckForUninterestingness;
             });
 
-      // Clone module before hacking it up..
-      std::unique_ptr<ReducerWorkItem> Clone =
-          cloneReducerWorkItem(Test.getProgram());
       // Generate Module with only Targets inside Current Chunks
       Oracle O(CurrentChunks);
       ExtractChunksFromModule(O, *Clone);
@@ -192,13 +205,84 @@
       return Clone;
     };
 
-    for (Chunk &ChunkToCheckForUninterestingness :
-         reverse(ChunksStillConsideredInteresting)) {
-      std::unique_ptr<ReducerWorkItem> Result =
-          CheckChunk(ChunkToCheckForUninterestingness);
+    // Disable parallel processing for MIR reduction for now.
+    if (Test.getProgram().MF)
+      MaxChunkThreads = 1;
+
+    // When running with more than one thread, serialize the original bitcode to
+    // OriginalBC.
+    SmallString<0> OriginalBC;
+    if (MaxChunkThreads > 1) {
+      raw_svector_ostream BCOS(OriginalBC);
+      WriteBitcodeToFile(*Test.getProgram().M, BCOS);
+    }
+
+    for (auto I = ChunksStillConsideredInteresting.rbegin(),
+              E = ChunksStillConsideredInteresting.rend();
+         I != E; ++I) {
+      std::unique_ptr<ReducerWorkItem> Result = nullptr;
+      unsigned WorkLeft = std::distance(I, E);
+      // Run in parallel mode if the user requested more than one thread and
+      // there are at least a few chunks to process.
+      if (MaxChunkThreads > 1 && WorkLeft > 4) {
+        NumTasks = std::min(WorkLeft, unsigned(MaxChunkThreads));
+
+        // Process NumTasks chunks in parallel using ChunkThreadPool. When the
+        // tasks are added to the pool, parse the original module from
+        // OriginalBC with a fresh LLVMContext object. This ensures that the
+        // cloned module of each task uses an independent LLVMContext object. If
+        // a task reduces the input, serialize the result back in the
+        // corresponding Result element.
+        for (unsigned J = 0; J < NumTasks; ++J) {
+          Results[J] = "";
+          ChunkThreadPool.async([&Results, J, I, &CheckChunk, &OriginalBC]() {
+            LLVMContext Ctx;
+            Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+                MemoryBufferRef(StringRef(OriginalBC.data(), OriginalBC.size()),
+                                ""),
+                Ctx);
+            if (!MOrErr)
+              report_fatal_error("Failed to read bitcode");
+            auto CloneMMM = std::make_unique<ReducerWorkItem>();
+            CloneMMM->M = std::move(MOrErr.get());
+
+            auto Res = CheckChunk(*(I + J), std::move(CloneMMM));
+            if (Res) {
+              raw_svector_ostream BCOS(Results[J]);
+              WriteBitcodeToFile(*Res->M, BCOS);
+            }
+          });
+        }
+        ChunkThreadPool.wait();
+
+        // Process the results after all jobs have finished. The first non-null
+        // result module is used. If multiple jobs successfully reduced a chunk,
+        // the other results are ignored. This ensures parallel processing
+        // produces the same results as serial processing.
+        for (unsigned J = 0; J < NumTasks; ++J) {
+          if (Results[J].empty())
+            continue;
+
+          Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+              MemoryBufferRef(StringRef(Results[J].data(), Results[J].size()),
+                              ""),
+              Test.getProgram().M->getContext());
+          if (!MOrErr)
+            report_fatal_error("Failed to read bitcode");
+          Result = std::make_unique<ReducerWorkItem>();
+          Result->M = std::move(MOrErr.get());
+          I += J;
+          break;
+        }
+        if (!Result)
+          I += NumTasks - 1;
+      } else {
+        Result = CheckChunk(*I, cloneReducerWorkItem(Test.getProgram()));
+      }
       if (!Result)
         continue;
 
+      Chunk &ChunkToCheckForUninterestingness = *I;
       FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = true;
       UninterestingChunks.insert(ChunkToCheckForUninterestingness);
       ReducedProgram = std::move(Result);
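
The determinism argument in the "Process the results" comment above is the key design point of the patch: a batch of chunks is checked concurrently, but only the first successful result in submission order is kept, so a parallel run reduces exactly the chunks a serial run would. The following standalone sketch illustrates that selection strategy in plain C++ using std::async rather than the llvm::ThreadPool used by the patch; it is illustrative only, and all identifiers in it (checkCandidate, Slots, Winner, and so on) are invented for this example.

// Sketch of the "first success in submission order wins" batching strategy.
// Not part of the patch; simplified stand-in for CheckChunk and the chunk loop.
#include <algorithm>
#include <future>
#include <optional>
#include <string>
#include <vector>

// Stand-in for CheckChunk: returns a serialized result when a candidate
// "reduces" successfully, std::nullopt otherwise.
static std::optional<std::string> checkCandidate(unsigned Index) {
  if (Index % 3 == 0) // pretend every third candidate reduces successfully
    return "reduced-" + std::to_string(Index);
  return std::nullopt;
}

int main() {
  const unsigned NumCandidates = 10;
  const unsigned NumTasks = 4; // analogous to -max-chunk-threads

  unsigned Next = 0;
  while (Next < NumCandidates) {
    unsigned Batch = std::min(NumTasks, NumCandidates - Next);

    // Launch one task per candidate; each task owns its own result slot, so
    // no synchronization beyond joining the futures is required.
    std::vector<std::future<std::optional<std::string>>> Slots(Batch);
    for (unsigned J = 0; J < Batch; ++J)
      Slots[J] = std::async(std::launch::async, checkCandidate, Next + J);

    // Consume the slots strictly in submission order and keep only the first
    // success; later successes in the same batch are discarded. This mirrors
    // the "first non-null result wins" loop in the patch and keeps the
    // parallel run equivalent to a serial one.
    std::optional<std::string> Winner;
    unsigned Advance = Batch;
    for (unsigned J = 0; J < Batch; ++J) {
      std::optional<std::string> R = Slots[J].get();
      if (!Winner && R) {
        Winner = std::move(R);
        Advance = J + 1; // resume right after the winning candidate
      }
    }

    // In llvm-reduce, the winning result would replace the current program
    // here before the next batch is scheduled.
    Next += Advance;
  }
  return 0;
}

Discarding the later successes in a batch trades some wasted work for reproducible output; that is what the added RUN lines in operands-skip.ll rely on when checking the -max-chunk-threads=2 and -max-chunk-threads=4 runs against the same REDUCED expectations as the serial run.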