diff --git a/llvm/test/tools/llvm-reduce/operands-skip.ll b/llvm/test/tools/llvm-reduce/operands-skip.ll
--- a/llvm/test/tools/llvm-reduce/operands-skip.ll
+++ b/llvm/test/tools/llvm-reduce/operands-skip.ll
@@ -1,6 +1,13 @@
 ; RUN: llvm-reduce %s -o %t --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
 ; RUN: FileCheck %s --input-file %t --check-prefixes=REDUCED
 
+; RUN: llvm-reduce -max-chunk-threads=2 %s -o %t.1 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
+; RUN: FileCheck %s --input-file %t.1 --check-prefixes=REDUCED
+
+; RUN: llvm-reduce -max-chunk-threads=4 %s -o %t.2 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
+; RUN: FileCheck %s --input-file %t.2 --check-prefixes=REDUCED
+
+
 ; INTERESTING: store i32 43, i32* {{(%imm|%indirect)}}, align 4
 ; REDUCED: store i32 43, i32* %imm, align 4
 
diff --git a/llvm/tools/llvm-reduce/deltas/Delta.cpp b/llvm/tools/llvm-reduce/deltas/Delta.cpp
--- a/llvm/tools/llvm-reduce/deltas/Delta.cpp
+++ b/llvm/tools/llvm-reduce/deltas/Delta.cpp
@@ -15,8 +15,11 @@
 #include "Delta.h"
 #include "ReducerWorkItem.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <fstream>
 #include <set>
@@ -31,6 +34,12 @@
     "starting-granularity-level",
     cl::desc("Number of times to divide chunks prior to first test"));
 
+static cl::opt<unsigned> MaxChunkThreads(
+    "max-chunk-threads",
+    cl::desc("Maximum number of threads to use to process chunks. Set to 1 to "
+             "disable parallelism. Maximum capped to 32."),
+    cl::init(1));
+
 void writeOutput(ReducerWorkItem &M, llvm::StringRef Message);
 
 bool isReduced(ReducerWorkItem &M, TestRunner &Test,
@@ -135,6 +144,12 @@
     increaseGranularity(ChunksStillConsideredInteresting);
   }
 
+  MaxChunkThreads = std::min(unsigned(MaxChunkThreads), 32u);
+  ThreadPool ChunkThreadPool(hardware_concurrency(MaxChunkThreads));
+
+  unsigned NumTasks = MaxChunkThreads * 2;
+  SmallVector<SmallString<0>> Results(MaxChunkThreads * 2, {});
+
   bool FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity;
   do {
     FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = false;
@@ -145,7 +160,8 @@
     // modified module if the chunk resulted in a reduction.
     auto CheckChunk = [&UninterestingChunks, &Test, &ExtractChunksFromModule,
                        &ChunksStillConsideredInteresting](
-                          Chunk &ChunkToCheckForUninterestingness)
+                          Chunk &ChunkToCheckForUninterestingness,
+                          std::unique_ptr<ReducerWorkItem> Clone)
         -> std::unique_ptr<ReducerWorkItem> {
       // Take all of ChunksStillConsideredInteresting chunks, except those we've
       // already deemed uninteresting (UninterestingChunks) but didn't remove
@@ -160,9 +176,6 @@
                      C != ChunkToCheckForUninterestingness;
             });
 
-      // Clone module before hacking it up..
-      std::unique_ptr<ReducerWorkItem> Clone =
-          cloneReducerWorkItem(Test.getProgram());
       // Generate Module with only Targets inside Current Chunks
       Oracle O(CurrentChunks);
       ExtractChunksFromModule(O, *Clone);
@@ -192,13 +205,84 @@
       return Clone;
     };
 
-    for (Chunk &ChunkToCheckForUninterestingness :
-         reverse(ChunksStillConsideredInteresting)) {
-      std::unique_ptr<ReducerWorkItem> Result =
-          CheckChunk(ChunkToCheckForUninterestingness);
+    // Disable parallel processing for MIR reduction for now.
+    if (Test.getProgram().MF)
+      MaxChunkThreads = 1;
+
+    // When running with more than one thread, serialize the original bitcode to
+    // OriginalBC.
+    SmallString<0> OriginalBC;
+    if (MaxChunkThreads > 1) {
+      raw_svector_ostream BCOS(OriginalBC);
+      WriteBitcodeToFile(*Test.getProgram().M, BCOS);
+    }
+
+    for (auto I = ChunksStillConsideredInteresting.rbegin(),
+              E = ChunksStillConsideredInteresting.rend();
+         I != E; ++I) {
+      std::unique_ptr<ReducerWorkItem> Result = nullptr;
+      unsigned WorkLeft = std::distance(I, E);
+      // Run in parallel mode if the user requested more than one thread and
+      // there are at least a few chunks to process.
+      if (MaxChunkThreads > 1 && WorkLeft > 4) {
+        NumTasks = std::min(WorkLeft, unsigned(MaxChunkThreads));
+
+        // Process NumTasks chunks in parallel using ChunkThreadPool. When the
+        // tasks are added to the pool, parse the original module from
+        // OriginalBC with a fresh LLVMContext object. This ensures that the
+        // cloned module of each task uses an independent LLVMContext object. If
+        // a task reduces the input, serialize the result back in the
+        // corresponding Result element.
+        for (unsigned J = 0; J < NumTasks; ++J) {
+          Results[J] = "";
+          ChunkThreadPool.async([&Results, J, I, &CheckChunk, &OriginalBC]() {
+            LLVMContext Ctx;
+            Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+                MemoryBufferRef(StringRef(OriginalBC.data(), OriginalBC.size()),
+                                ""),
+                Ctx);
+            if (!MOrErr)
+              report_fatal_error("Failed to read bitcode");
+            auto CloneMMM = std::make_unique<ReducerWorkItem>();
+            CloneMMM->M = std::move(MOrErr.get());
+
+            auto Res = CheckChunk(*(I + J), std::move(CloneMMM));
+            if (Res) {
+              raw_svector_ostream BCOS(Results[J]);
+              WriteBitcodeToFile(*Res->M, BCOS);
+            }
+          });
+        }
+        ChunkThreadPool.wait();
+
+        // Process the results after all jobs have finished. The first non-null
+        // result module is used. If multiple jobs successfully reduced a chunk,
+        // the other results are ignored. This ensures parallel processing
+        // produces the same results as serial processing.
+        for (unsigned J = 0; J < NumTasks; ++J) {
+          if (Results[J].empty())
+            continue;
+
+          Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+              MemoryBufferRef(StringRef(Results[J].data(), Results[J].size()),
+                              ""),
+              Test.getProgram().M->getContext());
+          if (!MOrErr)
+            report_fatal_error("Failed to read bitcode");
+          Result = std::make_unique<ReducerWorkItem>();
+          Result->M = std::move(MOrErr.get());
+          I += J;
+          break;
+        }
+        if (!Result)
+          I += NumTasks - 1;
+      } else {
+        Result = CheckChunk(*I, cloneReducerWorkItem(Test.getProgram()));
+      }
       if (!Result)
         continue;
 
+      Chunk &ChunkToCheckForUninterestingness = *I;
       FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = true;
       UninterestingChunks.insert(ChunkToCheckForUninterestingness);
       ReducedProgram = std::move(Result);
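
The determinism argument in the "Process the results" comment above is the key design point of the patch: a batch of chunks is checked concurrently, but only the first successful result in submission order is kept, so a parallel run reduces exactly the chunks a serial run would. The following standalone sketch illustrates that selection strategy in plain C++ using std::async rather than the llvm::ThreadPool used by the patch; it is illustrative only, and all identifiers in it (checkCandidate, Slots, Winner, and so on) are invented for this example.

// Sketch of the "first success in submission order wins" batching strategy.
// Not part of the patch; simplified stand-in for CheckChunk and the chunk loop.
#include <algorithm>
#include <future>
#include <optional>
#include <string>
#include <vector>

// Stand-in for CheckChunk: returns a serialized result when a candidate
// "reduces" successfully, std::nullopt otherwise.
static std::optional<std::string> checkCandidate(unsigned Index) {
  if (Index % 3 == 0) // pretend every third candidate reduces successfully
    return "reduced-" + std::to_string(Index);
  return std::nullopt;
}

int main() {
  const unsigned NumCandidates = 10;
  const unsigned NumTasks = 4; // analogous to -max-chunk-threads

  unsigned Next = 0;
  while (Next < NumCandidates) {
    unsigned Batch = std::min(NumTasks, NumCandidates - Next);

    // Launch one task per candidate; each task owns its own result slot, so
    // no synchronization beyond joining the futures is required.
    std::vector<std::future<std::optional<std::string>>> Slots(Batch);
    for (unsigned J = 0; J < Batch; ++J)
      Slots[J] = std::async(std::launch::async, checkCandidate, Next + J);

    // Consume the slots strictly in submission order and keep only the first
    // success; later successes in the same batch are discarded. This mirrors
    // the "first non-null result wins" loop in the patch and keeps the
    // parallel run equivalent to a serial one.
    std::optional<std::string> Winner;
    unsigned Advance = Batch;
    for (unsigned J = 0; J < Batch; ++J) {
      std::optional<std::string> R = Slots[J].get();
      if (!Winner && R) {
        Winner = std::move(R);
        Advance = J + 1; // resume right after the winning candidate
      }
    }

    // In llvm-reduce, the winning result would replace the current program
    // here before the next batch is scheduled.
    Next += Advance;
  }
  return 0;
}

Discarding the later successes in a batch trades some wasted work for reproducible output; that is what the added RUN lines in operands-skip.ll rely on when checking the -max-chunk-threads=2 and -max-chunk-threads=4 runs against the same REDUCED expectations as the serial run.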