diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -531,13 +531,19 @@
 
 class DeduplicatedCStringSection final : public CStringSection {
 public:
-  DeduplicatedCStringSection();
-  uint64_t getSize() const override { return builder.getSize(); }
+  uint64_t getSize() const override { return size; }
   void finalizeContents() override;
-  void writeTo(uint8_t *buf) const override { builder.write(buf); }
+  void writeTo(uint8_t *buf) const override;
 
 private:
-  llvm::StringTableBuilder builder;
+  struct StringOffset {
+    uint8_t trailingZeros;
+    uint64_t outSecOff = UINT64_MAX;
+
+    explicit StringOffset(uint8_t zeros) : trailingZeros(zeros) {}
+  };
+  llvm::DenseMap<llvm::CachedHashStringRef, StringOffset> stringOffsetMap;
+  size_t size = 0;
 };
 
 /*
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -1341,7 +1341,10 @@
     for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) {
       if (!isec->pieces[i].live)
         continue;
-      uint32_t pieceAlign = MinAlign(isec->pieces[i].inSecOff, align);
+      // See comment above DeduplicatedCStringSection for how alignment is
+      // handled.
+      uint32_t pieceAlign =
+          1 << countTrailingZeros(isec->align | isec->pieces[i].inSecOff);
       offset = alignTo(offset, pieceAlign);
       isec->pieces[i].outSecOff = offset;
       isec->isFinal = true;
@@ -1351,45 +1354,89 @@
   }
   size = offset;
 }
+
 // Mergeable cstring literals are found under the __TEXT,__cstring section. In
 // contrast to ELF, which puts strings that need different alignments into
 // different sections, clang's Mach-O backend puts them all in one section.
 // Strings that need to be aligned have the .p2align directive emitted before
-// them, which simply translates into zero padding in the object file.
+// them, which simply translates into zero padding in the object file. In other
+// words, we have to infer the desired alignment of these cstrings from their
+// addresses.
 //
-// I *think* ld64 extracts the desired per-string alignment from this data by
-// preserving each string's offset from the last section-aligned address. I'm
-// not entirely certain since it doesn't seem consistent about doing this, and
-// in fact doesn't seem to be correct in general: we can in fact can induce ld64
-// to produce a crashing binary just by linking in an additional object file
-// that only contains a duplicate cstring at a different alignment. See PR50563
-// for details.
+// We differ slightly from ld64 in how we've chosen to align these cstrings.
+// Both LLD and ld64 preserve the number of trailing zeros in each cstring's
+// address in the input object files. When deduplicating identical cstrings,
+// both linkers pick the cstring whose address has more trailing zeros, and
+// preserve the alignment of that address in the final binary. However, ld64
+// goes a step further and also preserves the offset of the cstring from the
+// last section-aligned address.  I.e. if a cstring is at offset 18 in the
+// input, with a section alignment of 16, then both LLD and ld64 will ensure the
+// final address is 2-byte aligned (since 18 == 16 + 2). But ld64 will also
+// ensure that the final address is of the form 16 * k + 2 for some k.
 //
-// On x86_64, the cstrings we've seen so far that require special alignment are
-// all accessed by SIMD operations -- x86_64 requires SIMD accesses to be
-// 16-byte-aligned. arm64 also seems to require 16-byte-alignment in some cases
-// (PR50791), but I haven't tracked down the root cause. So for now, I'm just
-// aligning all strings to 16 bytes.  This is indeed wasteful, but
-// implementation-wise it's simpler than preserving per-string
-// alignment+offsets. It also avoids the aforementioned crash after
-// deduplication of differently-aligned strings.  Finally, the overhead is not
-// huge: using 16-byte alignment (vs no alignment) is only a 0.5% size overhead
-// when linking chromium_framework on x86_64.
-DeduplicatedCStringSection::DeduplicatedCStringSection()
-    : builder(StringTableBuilder::RAW, /*Alignment=*/16) {}
-
+// Note that ld64's heuristic means that a dedup'ed cstring's final address is
+// dependent on the order of the input object files. E.g. if in addition to the
+// cstring at offset 18 above, we have a duplicate one in another file with a
+// `.cstring` section alignment of 2 and an offset of zero, then ld64 will pick
+// the cstring from the object file earlier on the command line (since both have
+// the same number of trailing zeros in their address). So the final cstring may
+// either be at some address `16 * k + 2` or at some address `2 * k`.
+//
+// I've opted not to follow this behavior primarily for implementation
+// simplicity, and secondarily to save a few more bytes. It's not clear to me
+// that preserving the section alignment + offset is ever necessary, and there
+// are many cases that are clearly redundant. In particular, if an x86_64 object
+// file contains some strings that are accessed via SIMD instructions, then the
+// .cstring section in the object file will be 16-byte-aligned (since SIMD
+// requires its operand addresses to be 16-byte aligned). However, there will
+// typically also be other cstrings in the same file that aren't used via SIMD
+// and don't need this alignment. They will be emitted at some arbitrary address
+// `A`, but ld64 will treat them as being 16-byte aligned with an offset of `16
+// % A`.
 void DeduplicatedCStringSection::finalizeContents() {
-  // Add all string pieces to the string table builder to create section
-  // contents.
+  // Find the largest alignment required for each string.
+  for (const CStringInputSection *isec : inputs) {
+    for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) {
+      const StringPiece &piece = isec->pieces[i];
+      if (!piece.live)
+        continue;
+      auto s = isec->getCachedHashStringRef(i);
+      assert(isec->align != 0);
+      uint8_t trailingZeros = countTrailingZeros(isec->align | piece.inSecOff);
+      auto it = stringOffsetMap.insert(
+          std::make_pair(s, StringOffset(trailingZeros)));
+      if (!it.second && it.first->second.trailingZeros < trailingZeros)
+        it.first->second.trailingZeros = trailingZeros;
+    }
+  }
+
+  // Assign an offset for each string and save it to the corresponding
+  // StringPieces for easy access.
   for (CStringInputSection *isec : inputs) {
-    for (size_t i = 0, e = isec->pieces.size(); i != e; ++i)
-      if (isec->pieces[i].live)
-        isec->pieces[i].outSecOff =
-            builder.add(isec->getCachedHashStringRef(i));
+    for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) {
+      if (!isec->pieces[i].live)
+        continue;
+      auto s = isec->getCachedHashStringRef(i);
+      auto it = stringOffsetMap.find(s);
+      assert(it != stringOffsetMap.end());
+      StringOffset &offsetInfo = it->second;
+      if (offsetInfo.outSecOff == UINT64_MAX) {
+        offsetInfo.outSecOff = alignTo(size, 1 << offsetInfo.trailingZeros);
+        size = offsetInfo.outSecOff + s.size();
+      }
+      isec->pieces[i].outSecOff = offsetInfo.outSecOff;
+    }
     isec->isFinal = true;
   }
+}
 
-  builder.finalizeInOrder();
+void DeduplicatedCStringSection::writeTo(uint8_t *buf) const {
+  for (const auto &p : stringOffsetMap) {
+    StringRef data = p.first.val();
+    uint64_t off = p.second.outSecOff;
+    if (!data.empty())
+      memcpy(buf + off, data.data(), data.size());
+  }
 }
 
 // This section is actually emitted as __TEXT,__const by ld64, but clang may
diff --git a/lld/MachO/ld64-vs-lld.rst b/lld/MachO/ld64-vs-lld.rst
--- a/lld/MachO/ld64-vs-lld.rst
+++ b/lld/MachO/ld64-vs-lld.rst
@@ -13,6 +13,12 @@
 occurring. In particular, programs which compare string literals via pointer
 equality must be fixed to use value equality instead.
 
+String Alignment
+****************
+LLD is slightly less conservative about aligning cstrings, allowing it to pack
+them more compactly. This should not result in any meaningful semantic
+difference.
+
 ``-no_deduplicate`` Flag
 **********************
 - LD64:
diff --git a/lld/test/MachO/cstring-align.s b/lld/test/MachO/cstring-align.s
new file mode 100644
--- /dev/null
+++ b/lld/test/MachO/cstring-align.s
@@ -0,0 +1,133 @@
+# REQUIRES: x86
+# RUN: rm -rf %t; split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-empty.s -o %t/align-empty.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-4-0.s -o %t/align-4-0.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-4-2.s -o %t/align-4-2.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-16-0.s -o %t/align-16-0.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-16-2.s -o %t/align-16-2.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-16-4.s -o %t/align-16-4.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-16-8.s -o %t/align-16-8.o
+
+## Check that we preserve the alignment of cstrings. Alignment is determined
+## not by section alignment but by the number of trailing zeros of the cstring's
+## address in the input object file.
+
+## The non-dedup case is not particularly interesting since the null bytes don't
+## get dedup'ed, meaning that the output strings get their offsets "naturally"
+## preserved.
+
+# RUN: %lld -dylib %t/align-empty.o %t/align-4-0.o %t/align-16-0.o -o %t/align-4-0-16-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-4-0-16-0 | \
+# RUN:   FileCheck %s -D#OFF1=4 -D#OFF2=16
+# RUN: %lld -dylib %t/align-empty.o %t/align-16-0.o %t/align-4-0.o -o %t/align-16-0-4-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-16-0-4-0 | \
+# RUN:   FileCheck %s -D#OFF1=16 -D#OFF2=20
+
+# RUN: %lld -dylib %t/align-empty.o %t/align-4-2.o %t/align-16-0.o -o %t/align-4-2-16-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-4-2-16-0 | \
+# RUN:   FileCheck %s -D#OFF1=6 -D#OFF2=16
+# RUN: %lld -dylib %t/align-empty.o %t/align-16-0.o %t/align-4-2.o -o %t/align-16-0-4-2
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-16-0-4-2 | \
+# RUN:   FileCheck %s -D#OFF1=16 -D#OFF2=22
+
+# RUN: %lld -dylib %t/align-empty.o %t/align-4-0.o %t/align-16-2.o -o %t/align-4-0-16-2
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-4-0-16-2 | \
+# RUN:   FileCheck %s -D#OFF1=4 -D#OFF2=18
+# RUN: %lld -dylib %t/align-empty.o %t/align-16-2.o %t/align-4-0.o -o %t/align-16-2-4-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-16-2-4-0 | \
+# RUN:   FileCheck %s -D#OFF1=18 -D#OFF2=20
+
+# CHECK:       Contents of (__TEXT,__cstring) section
+# CHECK-NEXT:  [[#%.16x,START:]]     {{$}}
+# CHECK:       [[#%.16x,START+OFF1]] a{{$}}
+# CHECK:       [[#%.16x,START+OFF2]] a{{$}}
+# CHECK-EMPTY:
+
+## The dedup cases are more interesting...
+
+## Same offset, different alignments => pick higher alignment
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-0.o %t/align-16-0.o -o %t/dedup-4-0-16-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-0-16-0 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=16
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-0.o %t/align-4-0.o -o %t/dedup-16-0-4-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-0-4-0 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=16
+
+## 16 byte alignment vs 2 byte offset => align to 16 bytes
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-2.o %t/align-16-0.o -o %t/dedup-4-2-16-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-2-16-0 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=16
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-0.o %t/align-4-2.o -o %t/dedup-16-0-4-2
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-0-4-2 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=16
+
+## 4 byte alignment vs 2 byte offset => align to 4 bytes
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-0.o %t/align-16-2.o -o %t/dedup-4-0-16-2
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-0-16-2 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=4
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-2.o %t/align-4-0.o -o %t/dedup-16-2-4-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-2-4-0 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=4
+
+## Both inputs are 4-byte aligned, one via offset and the other via section alignment
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-0.o %t/align-16-4.o -o %t/dedup-4-0-16-4
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-0-16-4 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=4
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-4.o %t/align-4-0.o -o %t/dedup-16-4-4-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-4-4-0 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=4
+
+## 8-byte offset vs 4-byte section alignment => align to 8 bytes
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-0.o %t/align-16-8.o -o %t/dedup-4-0-16-8
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-0-16-8 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=8
+# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-8.o %t/align-4-0.o -o %t/dedup-16-8-4-0
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-8-4-0 | \
+# RUN:   FileCheck %s --check-prefix=DEDUP -D#OFF=8
+
+# DEDUP:       Contents of (__TEXT,__cstring) section
+# DEDUP-NEXT:  [[#%.16x,START:]]    {{$}}
+# DEDUP:       [[#%.16x,START+OFF]] a{{$}}
+# DEDUP-EMPTY:
+
+#--- align-empty.s
+## We use this file to create an empty string at the start of every output
+## file's .cstring section. This makes the test cases more interesting since LLD
+## can't place the string "a" at the trivially-aligned zero offset.
+.cstring
+.p2align 2
+.asciz ""
+
+#--- align-4-0.s
+.cstring
+.p2align 2
+.asciz "a"
+
+#--- align-4-2.s
+.cstring
+.p2align 2
+.zero 0x2
+.asciz "a"
+
+#--- align-16-0.s
+.cstring
+.p2align 4
+.asciz "a"
+
+#--- align-16-2.s
+.cstring
+.p2align 4
+.zero 0x2
+.asciz "a"
+
+#--- align-16-4.s
+.cstring
+.p2align 4
+.zero 0x4
+.asciz "a"
+
+#--- align-16-8.s
+.cstring
+.p2align 4
+.zero 0x8
+.asciz "a"
diff --git a/lld/test/MachO/cstring-dedup.s b/lld/test/MachO/cstring-dedup.s
--- a/lld/test/MachO/cstring-dedup.s
+++ b/lld/test/MachO/cstring-dedup.s
@@ -8,12 +8,11 @@
 # RUN: llvm-objdump --macho --section="__DATA,ptrs" --syms %t/test | FileCheck %s
 # RUN: llvm-readobj --section-headers %t/test | FileCheck %s --check-prefix=HEADER
 
-## Make sure we only have 3 deduplicated strings in __cstring, and that they
-## are 16-byte-aligned.
+## Make sure we only have 3 deduplicated strings in __cstring.
 # STR: Contents of (__TEXT,__cstring) section
-# STR: {{.*}}0 foo
-# STR: {{.*}}0 barbaz
-# STR: {{.*}}0 {{$}}
+# STR: {{[[:xdigit:]]+}} foo
+# STR: {{[[:xdigit:]]+}} barbaz
+# STR: {{[[:xdigit:]]+}} {{$}}
 
 ## Make sure both symbol and section relocations point to the right thing.
 # CHECK:      Contents of (__DATA,ptrs) section