diff --git a/llvm/test/tools/llvm-reduce/operands-skip.ll b/llvm/test/tools/llvm-reduce/operands-skip.ll
--- a/llvm/test/tools/llvm-reduce/operands-skip.ll
+++ b/llvm/test/tools/llvm-reduce/operands-skip.ll
@@ -1,6 +1,13 @@
 ; RUN: llvm-reduce %s -o %t --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
 ; RUN: FileCheck %s --input-file %t --check-prefixes=REDUCED
 
+; RUN: llvm-reduce -j 2 %s -o %t.1 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
+; RUN: FileCheck %s --input-file %t.1 --check-prefixes=REDUCED
+
+; RUN: llvm-reduce -j 4 %s -o %t.2 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
+; RUN: FileCheck %s --input-file %t.2 --check-prefixes=REDUCED
+
+
 ; INTERESTING: store i32 43, i32* {{(%imm|%indirect)}}, align 4
 ; REDUCED: store i32 43, i32* %imm, align 4
 
diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt
--- a/llvm/tools/llvm-reduce/CMakeLists.txt
+++ b/llvm/tools/llvm-reduce/CMakeLists.txt
@@ -3,6 +3,7 @@
   AllTargetsCodeGens
   AllTargetsDescs
   AllTargetsInfos
+  BitReader
   BitWriter
   CodeGen
   Core
diff --git a/llvm/tools/llvm-reduce/deltas/Delta.cpp b/llvm/tools/llvm-reduce/deltas/Delta.cpp
--- a/llvm/tools/llvm-reduce/deltas/Delta.cpp
+++ b/llvm/tools/llvm-reduce/deltas/Delta.cpp
@@ -15,9 +15,11 @@
 #include "Delta.h"
 #include "ReducerWorkItem.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <fstream>
 #include <set>
@@ -37,6 +39,12 @@
     cl::desc("Write temporary files as bitcode, instead of textual IR"),
     cl::init(false));
 
+static cl::opt<unsigned> NumJobs(
+    "j",
+    cl::desc("Maximum number of threads to use to process chunks. Set to 1 to "
+             "disable parallelism. Maximum capped to 32."),
+    cl::init(1));
+
 void writeOutput(ReducerWorkItem &M, llvm::StringRef Message);
 
 bool isReduced(ReducerWorkItem &M, TestRunner &Test,
@@ -120,7 +128,8 @@
 // modified module if the chunk resulted in a reduction.
 template <typename T>
 static std::unique_ptr<ReducerWorkItem>
-CheckChunk(Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
+CheckChunk(Chunk &ChunkToCheckForUninterestingness,
+           std::unique_ptr<ReducerWorkItem> Clone, TestRunner &Test,
            function_ref<void(Oracle &, T &)> ExtractChunksFromModule,
            std::set<Chunk> &UninterestingChunks,
            std::vector<Chunk> &ChunksStillConsideredInteresting) {
@@ -137,9 +146,6 @@
                    C != ChunkToCheckForUninterestingness;
           });
 
-  // Clone module before hacking it up..
-  std::unique_ptr<ReducerWorkItem> Clone =
-      cloneReducerWorkItem(Test.getProgram());
   // Generate Module with only Targets inside Current Chunks
   Oracle O(CurrentChunks);
   ExtractChunksFromModule(O, *Clone);
@@ -169,6 +175,39 @@
   return Clone;
 }
 
+// Used to communicate when a task reduced a chunk.
+static std::mutex AnyReducedMutex;
+static bool AnyReduced = false;
+
+template <typename T>
+void ProcessChunkFromSerializedBitcode(
+    Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
+    function_ref<void(Oracle &, T &)> ExtractChunksFromModule,
+    std::set<Chunk> &UninterestingChunks,
+    std::vector<Chunk> &ChunksStillConsideredInteresting,
+    SmallString<0> &OriginalBC, SmallString<0> &ResultBuffer) {
+  LLVMContext Ctx;
+  Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+      MemoryBufferRef(StringRef(OriginalBC.data(), OriginalBC.size()),
+                      "<llvm-reduce tmp module>"),
+      Ctx);
+  if (!MOrErr)
+    report_fatal_error("Failed to read bitcode");
+  auto CloneMMM = std::make_unique<ReducerWorkItem>();
+  CloneMMM->M = std::move(MOrErr.get());
+
+  if (std::unique_ptr<ReducerWorkItem> ChunkResult =
+          CheckChunk(ChunkToCheckForUninterestingness, std::move(CloneMMM),
+                     Test, ExtractChunksFromModule, UninterestingChunks,
+                     ChunksStillConsideredInteresting)) {
+    raw_svector_ostream BCOS(ResultBuffer);
+    WriteBitcodeToFile(*ChunkResult->M, BCOS);
+    // Communicate that the task reduced a chunk.
+    std::unique_lock<std::mutex> Lock(AnyReducedMutex);
+    AnyReduced = true;
+  }
+}
+
 /// Runs the Delta Debugging algorithm, splits the code into chunks and
 /// reduces the amount of chunks that are considered interesting by the
 /// given test.
@@ -207,19 +246,116 @@
     increaseGranularity(ChunksStillConsideredInteresting);
   }
 
+  NumJobs = std::min(unsigned(NumJobs), 32u);
+  ThreadPool ChunkThreadPool(hardware_concurrency(NumJobs));
+
   bool FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity;
   do {
     FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = false;
     std::set<Chunk> UninterestingChunks;
-    for (Chunk &ChunkToCheckForUninterestingness :
-         reverse(ChunksStillConsideredInteresting)) {
-      std::unique_ptr<ReducerWorkItem> Result = CheckChunk(
-          ChunkToCheckForUninterestingness, Test, ExtractChunksFromModule,
-          UninterestingChunks, ChunksStillConsideredInteresting);
+
+    // When running with more than one thread, serialize the original bitcode
+    // to OriginalBC.
+    SmallString<0> OriginalBC;
+    if (NumJobs > 1) {
+      raw_svector_ostream BCOS(OriginalBC);
+      WriteBitcodeToFile(*Test.getProgram().M, BCOS);
+    }
+
+    SmallVector<SmallString<0>> Results(NumJobs * 3, {});
+    for (auto I = ChunksStillConsideredInteresting.rbegin(),
+              E = ChunksStillConsideredInteresting.rend();
+         I != E; ++I) {
+      std::unique_ptr<ReducerWorkItem> Result = nullptr;
+      unsigned WorkLeft = std::distance(I, E);
+
+      // Run in parallel mode, if the user requested more than one thread and
+      // there are at least a few chunks to process.
+      if (NumJobs > 1 && WorkLeft > 1) {
+        unsigned NumInitialTasks = std::min(WorkLeft, unsigned(NumJobs) + 2);
+        unsigned NumChunksProcessed = 0;
+
+        AnyReduced = false;
+        std::deque<std::shared_future<void>> TaskQueue;
+        // Process NumInitialTasks chunks in parallel using ChunkThreadPool.
+        // When the tasks are added to the pool, parse the original module from
+        // OriginalBC with a fresh LLVMContext object. This ensures that the
+        // cloned module of each task uses an independent LLVMContext object. If
+        // a task reduces the input, serialize the result back in the
+        // corresponding Result element.
+        for (unsigned J = 0; J < NumInitialTasks; ++J) {
+          Results[J] = "";
+          TaskQueue.push_back(ChunkThreadPool.async(
+              [J, I, &Test, &ExtractChunksFromModule, &UninterestingChunks,
+               &ChunksStillConsideredInteresting, &OriginalBC, &Results]() {
+                ProcessChunkFromSerializedBitcode(
+                    *(I + J), Test, ExtractChunksFromModule,
+                    UninterestingChunks, ChunksStillConsideredInteresting,
+                    OriginalBC, Results[J]);
+              }));
+        }
+
+        // Start processing results of the queued tasks. We wait for the first
+        // task in the queue to finish. If it reduced a chunk, we parse the
+        // result and exit the loop.
+        // Otherwise we will try to schedule a new task, if
+        //  * no other pending job reduced a chunk,
+        //  * there is still space in the Results vector and
+        //  * we have not reached the end of the chunks.
+        while (!TaskQueue.empty()) {
+          auto &F = TaskQueue.front();
+          F.wait();
+          TaskQueue.pop_front();
+
+          SmallString<0> &TaskResult = Results[NumChunksProcessed];
+          NumChunksProcessed++;
+          if (TaskResult.empty()) {
+            unsigned NumScheduledTasks = NumChunksProcessed + TaskQueue.size();
+            bool NotReduced = true;
+            {
+              std::unique_lock<std::mutex> Lock(AnyReducedMutex);
+              NotReduced = !AnyReduced;
+            }
+            if (NotReduced && I + NumScheduledTasks != E &&
+                NumScheduledTasks < Results.size()) {
+              Results[NumScheduledTasks] = "";
+              Chunk &ChunkToCheck = *(I + NumScheduledTasks);
+              TaskQueue.push_back(ChunkThreadPool.async(
+                  [NumScheduledTasks, &Test, &ExtractChunksFromModule,
+                   &UninterestingChunks, &ChunksStillConsideredInteresting,
+                   &OriginalBC, &Results, &ChunkToCheck]() {
+                    ProcessChunkFromSerializedBitcode(
+                        ChunkToCheck, Test, ExtractChunksFromModule,
+                        UninterestingChunks, ChunksStillConsideredInteresting,
+                        OriginalBC, Results[NumScheduledTasks]);
+                  }));
+            }
+            continue;
+          }
+
+          Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+              MemoryBufferRef(StringRef(TaskResult.data(), TaskResult.size()),
+                              "<llvm-reduce tmp module>"),
+              Test.getProgram().M->getContext());
+          if (!MOrErr)
+            report_fatal_error("Failed to read bitcode");
+          Result = std::make_unique<ReducerWorkItem>();
+          Result->M = std::move(MOrErr.get());
+          break;
+        }
+        // Forward I to the last chunk processed in parallel.
+        I += NumChunksProcessed - 1;
+      } else {
+        Result = CheckChunk(*I, cloneReducerWorkItem(Test.getProgram()), Test,
+                            ExtractChunksFromModule, UninterestingChunks,
+                            ChunksStillConsideredInteresting);
+      }
+
       if (!Result)
         continue;
 
+      Chunk &ChunkToCheckForUninterestingness = *I;
       FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = true;
       UninterestingChunks.insert(ChunkToCheckForUninterestingness);
       ReducedProgram = std::move(Result);