diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt
--- a/lld/ELF/CMakeLists.txt
+++ b/lld/ELF/CMakeLists.txt
@@ -2,6 +2,10 @@
 tablegen(LLVM Options.inc -gen-opt-parser-defs)
 add_public_tablegen_target(ELFOptionsTableGen)
 
+if(LLVM_ENABLE_ZLIB)
+  set(imported_libs ZLIB::ZLIB)
+endif()
+
 add_lld_library(lldELF
   AArch64ErrataFix.cpp
   Arch/AArch64.cpp
@@ -58,6 +62,7 @@
 
   LINK_LIBS
     lldCommon
+    ${imported_libs}
     ${LLVM_PTHREAD_LIB}
 
   DEPENDS
diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h
--- a/lld/ELF/OutputSections.h
+++ b/lld/ELF/OutputSections.h
@@ -25,6 +25,12 @@
 class InputSection;
 class InputSectionBase;
 
+struct CompressedData {
+  std::unique_ptr<SmallVector<uint8_t, 0>[]> shards;
+  uint32_t numShards = 0;
+  uint32_t checksum = 0;
+};
+
 // This represents a section in an output file.
 // It is composed of multiple InputSections.
 // The writer creates multiple OutputSections and assign them unique,
@@ -113,7 +119,7 @@
 private:
   // Used for implementation of --compress-debug-sections option.
   SmallVector<uint8_t, 0> zDebugHeader;
-  SmallVector<char, 0> compressedData;
+  CompressedData compressed;
 
   std::array<uint8_t, 4> getFiller();
 };
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -15,7 +15,7 @@
 #include "lld/Common/Memory.h"
 #include "lld/Common/Strings.h"
 #include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/Support/Compression.h"
+#include "llvm/Config/config.h" // LLVM_ENABLE_ZLIB
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Parallel.h"
@@ -23,6 +23,9 @@
 #include "llvm/Support/TimeProfiler.h"
 #include <regex>
 #include <unordered_set>
+#if LLVM_ENABLE_ZLIB
+#include <zlib.h>
+#endif
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -284,13 +287,45 @@
   memcpy(buf + i, filler.data(), size - i);
 }
 
+#if LLVM_ENABLE_ZLIB
+static SmallVector<uint8_t, 0> deflateShard(ArrayRef<uint8_t> in, int level,
+                                            int flush) {
+  // 15 and 8 are default. windowBits=-15 is negative to generate raw deflate
+  // data with no zlib header or trailer.
+  z_stream s = {};
+  deflateInit2(&s, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
+  s.next_in = const_cast<uint8_t *>(in.data());
+  s.avail_in = in.size();
+
+  // Allocate a buffer of half of the input size, and grow it by 1.5x if
+  // insufficient.
+  SmallVector<uint8_t, 0> out;
+  size_t pos = 0;
+  out.resize_for_overwrite(std::max<size_t>(in.size() / 2, 64));
+  do {
+    if (pos == out.size())
+      out.resize_for_overwrite(out.size() * 3 / 2);
+    s.next_out = out.data() + pos;
+    s.avail_out = out.size() - pos;
+    (void)deflate(&s, flush);
+    pos = s.next_out - out.data();
+  } while (s.avail_out == 0);
+  assert(s.avail_in == 0);
+
+  out.truncate(pos);
+  deflateEnd(&s);
+  return out;
+}
+#endif
+
 // Compress section contents if this section contains debug info.
 template <class ELFT> void OutputSection::maybeCompress() {
+#if LLVM_ENABLE_ZLIB
   using Elf_Chdr = typename ELFT::Chdr;
 
   // Compress only DWARF debug sections.
   if (!config->compressDebugSections || (flags & SHF_ALLOC) ||
-      !name.startswith(".debug_"))
+      !name.startswith(".debug_") || size == 0)
     return;
 
   llvm::TimeTraceScope timeScope("Compress debug sections");
@@ -309,13 +344,42 @@
   // -O2 is given, we use level 6 to compress debug info more by ~15%. We found
   // that level 7 to 9 doesn't make much difference (~1% more compression) while
   // they take significant amount of time (~2x), so level 6 seems enough.
-  if (Error e = zlib::compress(toStringRef(buf), compressedData,
-                               config->optimize >= 2 ? 6 : 1))
-    fatal("compress failed: " + llvm::toString(std::move(e)));
+  const int level = config->optimize >= 2 ? 6 : Z_BEST_SPEED;
+
+  // Split input into 1-MiB shards.
+  constexpr size_t shardSize = 1 << 20;
+  const size_t numShards = (size + shardSize - 1) / shardSize;
+  auto shardsIn = std::make_unique<ArrayRef<uint8_t>[]>(numShards);
+  for (size_t i = 0, start = 0, end; start != buf.size(); ++i, start = end) {
+    end = std::min(start + shardSize, buf.size());
+    shardsIn[i] = makeArrayRef(buf.data() + start, end - start);
+  }
+
+  // Compress shards and compute Adler-32 checksums. Use Z_SYNC_FLUSH for all
+  // shards but the last to flush the output to a byte boundary to be
+  // concatenated with the next shard.
+  auto shardsOut = std::make_unique<SmallVector<uint8_t, 0>[]>(numShards);
+  auto shardsAdler = std::make_unique<uint32_t[]>(numShards);
+  parallelForEachN(0, numShards, [&](size_t i) {
+    shardsOut[i] = deflateShard(shardsIn[i], level,
+                                i != numShards - 1 ? Z_SYNC_FLUSH : Z_FINISH);
+    shardsAdler[i] = adler32(1, shardsIn[i].data(), shardsIn[i].size());
+  });
+
+  // Update section size and combine Adler-32 checksums.
+  uint32_t checksum = 1;       // Initial Adler-32 value
+  size = sizeof(Elf_Chdr) + 2; // Elf_Chdr and zlib header
+  for (size_t i = 0; i != numShards; ++i) {
+    size += shardsOut[i].size();
+    checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size());
+  }
+  size += 4; // checksum
 
-  // Update section headers.
-  size = sizeof(Elf_Chdr) + compressedData.size();
+  compressed.shards = std::move(shardsOut);
+  compressed.numShards = numShards;
+  compressed.checksum = checksum;
   flags |= SHF_COMPRESSED;
+#endif
 }
 
 static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
@@ -339,10 +403,25 @@
   // If --compress-debug-section is specified and if this is a debug section,
   // we've already compressed section contents. If that's the case,
   // just write it down.
-  if (!compressedData.empty()) {
+  if (compressed.shards) {
     memcpy(buf, zDebugHeader.data(), zDebugHeader.size());
-    memcpy(buf + zDebugHeader.size(), compressedData.data(),
-           compressedData.size());
+    buf += zDebugHeader.size();
+    size -= zDebugHeader.size();
+
+    // Compute shard offsets.
+    auto offsets = std::make_unique<size_t[]>(compressed.numShards);
+    offsets[0] = 2; // zlib header
+    for (size_t i = 1; i != compressed.numShards; ++i)
+      offsets[i] = offsets[i - 1] + compressed.shards[i - 1].size();
+
+    buf[0] = 0x78; // CMF
+    buf[1] = 0x01; // FLG: best speed
+    parallelForEachN(0, compressed.numShards, [&](size_t i) {
+      memcpy(buf + offsets[i], compressed.shards[i].data(),
+             compressed.shards[i].size());
+    });
+
+    write32be(buf + size - 4, compressed.checksum);
     return;
   }
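
Not part of the patch: the standalone sketch below, written against plain zlib and the standard library rather than LLVM's SmallVector and parallelForEachN, illustrates why the stitching performed above is legal. Each shard is compressed as a raw deflate stream (windowBits=-15); Z_SYNC_FLUSH ends every shard but the last on a byte boundary without a final block, so the shards can be concatenated behind a hand-written 2-byte zlib header (0x78 0x01) and terminated with the combined Adler-32 of the whole input. The 4 MiB dummy input, the shard size, and the helper are illustrative only. Build with: c++ demo.cpp -lz

// Standalone sketch (not part of the patch): per-shard raw deflate streams,
// joined behind a hand-written zlib header and followed by the combined
// Adler-32, form one valid zlib stream that zlib can decompress in one shot.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>
#include <zlib.h>

// Compress one shard as a raw deflate stream (windowBits=-15: no zlib header
// or trailer). Z_SYNC_FLUSH ends the shard on a byte boundary without a final
// block; Z_FINISH marks the last shard.
static std::vector<uint8_t> deflateShard(const uint8_t *in, size_t size,
                                         int level, int flush) {
  z_stream s = {};
  deflateInit2(&s, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
  s.next_in = const_cast<uint8_t *>(in);
  s.avail_in = size;
  std::vector<uint8_t> out(deflateBound(&s, size) + 16); // slack for the flush
  s.next_out = out.data();
  s.avail_out = out.size();
  (void)deflate(&s, flush); // one call suffices: all input, ample output space
  assert(s.avail_in == 0);
  out.resize(out.size() - s.avail_out);
  deflateEnd(&s);
  return out;
}

int main() {
  std::string data(1 << 22, 'x'); // dummy input
  const size_t shardSize = 1 << 20;
  const uint8_t *p = reinterpret_cast<const uint8_t *>(data.data());

  std::vector<uint8_t> stream = {0x78, 0x01}; // CMF, FLG (fastest)
  uint32_t checksum = 1;                      // initial Adler-32 value
  for (size_t start = 0; start < data.size(); start += shardSize) {
    size_t n = std::min(shardSize, data.size() - start);
    bool last = start + n == data.size();
    std::vector<uint8_t> shard = deflateShard(
        p + start, n, Z_BEST_SPEED, last ? Z_FINISH : Z_SYNC_FLUSH);
    stream.insert(stream.end(), shard.begin(), shard.end());
    // Per-shard checksums are independent (computed in parallel in the patch)
    // and are folded together afterwards.
    checksum = adler32_combine(checksum, adler32(1, p + start, n), n);
  }
  for (int i = 3; i >= 0; --i) // trailer: big-endian Adler-32 of all input
    stream.push_back((checksum >> (8 * i)) & 0xff);

  // zlib's one-shot decompressor accepts the stitched stream.
  std::vector<uint8_t> out(data.size());
  uLongf outLen = out.size();
  int rc = uncompress(out.data(), &outLen, stream.data(), stream.size());
  assert(rc == Z_OK && outLen == data.size() &&
         std::memcmp(out.data(), data.data(), data.size()) == 0);
  (void)rc;
  return 0;
}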