diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@@ -563,6 +563,8 @@
 /// Represents required padding such that a particular other set of fragments
 /// does not cross a particular power-of-two boundary. The other fragments must
 /// follow this one within the same section.
+/// If AvoidEndAlign is set, this fragment instead emits a minimum-size nop so
+/// that the fragment following it does not end exactly at an \p AlignBoundary.
 class MCBoundaryAlignFragment : public MCFragment {
   /// The alignment requirement of the branch to be aligned.
   Align AlignBoundary;
@@ -571,6 +573,9 @@
   /// The size of the fragment.  The size is lazily set during relaxation, and
   /// is not meaningful before that.
   uint64_t Size = 0;
+  /// Whether this fragment pads the subsequent fragment to prevent it from
+  /// ending at AlignBoundary.
+  bool IsAvoidEndAlign = false;
 
 public:
   MCBoundaryAlignFragment(Align AlignBoundary, MCSection *Sec = nullptr)
@@ -589,6 +594,13 @@
     LastFragment = F;
   }
 
+  /// Returns whether this fragment pads the subsequent fragment to prevent it
+  /// from ending at AlignBoundary.
+  bool isAvoidEndAlign() const { return IsAvoidEndAlign; }
+  /// Sets whether this fragment pads the subsequent fragment to prevent it
+  /// from ending at AlignBoundary (true) or not (false, the default).
+  void setAvoidEndAlign(bool V) { IsAvoidEndAlign = V; }
+
   static bool classof(const MCFragment *F) {
     return F->getKind() == MCFragment::FT_BoundaryAlign;
   }
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -1083,14 +1083,28 @@
 
   uint64_t AlignedOffset = Layout.getFragmentOffset(&BF);
   uint64_t AlignedSize = 0;
-  for (const MCFragment *F = BF.getLastFragment(); F != &BF;
-       F = F->getPrevNode())
-    AlignedSize += computeFragmentSize(Layout, *F);
-
+  uint64_t NewSize = 0;
   Align BoundaryAlignment = BF.getAlignment();
-  uint64_t NewSize = needPadding(AlignedOffset, AlignedSize, BoundaryAlignment)
-                         ? offsetToAlignment(AlignedOffset, BoundaryAlignment)
-                         : 0U;
+
+  if (BF.isAvoidEndAlign()) {
+    // Measure only the fragment immediately following this BoundaryAlign. If
+    // there is none, nothing can end at the boundary, so no padding is needed.
+    if (const MCFragment *NF = BF.getNextNode()) {
+      AlignedSize = computeFragmentSize(Layout, *NF);
+
+      // Pad with a minimum size nop.
+      if (isAgainstBoundary(AlignedOffset, AlignedSize, BoundaryAlignment))
+        NewSize = getBackend().getMinimumNopSize();
+    }
+  } else {
+    for (const MCFragment *F = BF.getLastFragment(); F != &BF;
+         F = F->getPrevNode())
+      AlignedSize += computeFragmentSize(Layout, *F);
+
+    if (needPadding(AlignedOffset, AlignedSize, BoundaryAlignment))
+      NewSize = offsetToAlignment(AlignedOffset, BoundaryAlignment);
+  }
+
   if (NewSize == BF.getSize())
     return false;
   BF.setSize(NewSize);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -75,6 +75,13 @@
 
 X86AlignBranchKind X86AlignBranchKindLoc;
 
+cl::opt<bool> X86AlignForMacroFusion(
+    "x86-align-for-macrofusion", cl::init(false),
+    cl::desc("Align macro-fusion pairs to avoid 64B boundary falling between "
+             "the instructions. May break assumptions about labels "
+             "corresponding to particular instructions, and should be used "
+             "with caution."));
+
 cl::opt<unsigned> X86AlignBranchBoundary(
     "x86-align-branch-boundary", cl::init(0),
     cl::desc(
@@ -145,15 +152,25 @@
   X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
       : MCAsmBackend(support::little), STI(STI),
         MCII(T.createMCInstrInfo()) {
+    if (X86AlignForMacroFusion) {
+      AlignBoundary = assumeAligned(64);
+      AlignBranchType.addKind(X86::AlignBranchFused);
+      AlignBranchType.addKind(X86::AlignBranchJcc);
+    }
     if (X86AlignBranchWithin32BBoundaries) {
       // At the moment, this defaults to aligning fused branches, unconditional
       // jumps, and (unfused) conditional jumps with nops.  Both the
       // instructions aligned and the alignment method (nop vs prefix) may
       // change in the future.
-      AlignBoundary = assumeAligned(32);;
+      AlignBoundary = assumeAligned(32);
       AlignBranchType.addKind(X86::AlignBranchFused);
       AlignBranchType.addKind(X86::AlignBranchJcc);
       AlignBranchType.addKind(X86::AlignBranchJmp);
+      if (X86AlignForMacroFusion) {
+        // X86AlignBranchWithin32BBoundaries is stronger (fused pairs never
+        // cross a 32B boundary), so turn X86AlignForMacroFusion off.
+        X86AlignForMacroFusion = false;
+      }
     }
     // Allow overriding defaults set by master flag
     if (X86AlignBranchBoundary.getNumOccurrences())
@@ -615,7 +632,7 @@
     // Macro fusion actually happens and there is no other fragment inserted
     // after the previous instruction.
     //
-    // Do nothing here since we already inserted a BoudaryAlign fragment when
+    // Do nothing here since we already inserted a BoundaryAlign fragment when
     // we met the first instruction in the fused pair and we'll tie them
     // together in emitInstructionEnd.
     //
@@ -633,11 +650,14 @@
     return;
   }
 
-  if (needAlign(Inst) || ((AlignBranchType & X86::AlignBranchFused) &&
-                          isFirstMacroFusibleInst(Inst, *MCII))) {
+  bool IsBranchFused = (AlignBranchType & X86::AlignBranchFused) &&
+                       isFirstMacroFusibleInst(Inst, *MCII);
+  if (needAlign(Inst) || IsBranchFused) {
     // If we meet a unfused branch or the first instuction in a fusiable pair,
     // insert a BoundaryAlign fragment.
     OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary));
+    if (X86AlignForMacroFusion && IsBranchFused)
+      PendingBA->setAvoidEndAlign(true);
   }
 }
 
@@ -655,7 +675,7 @@
   if (!needAlign(Inst) || !PendingBA)
     return;
 
-  // Tie the aligned instructions into a a pending BoundaryAlign.
+  // Tie the aligned instructions into a pending BoundaryAlign.
   PendingBA->setLastFragment(CF);
   PendingBA = nullptr;
 
diff --git a/llvm/test/MC/X86/auto-mf-align.s b/llvm/test/MC/X86/auto-mf-align.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/X86/auto-mf-align.s
@@ -0,0 +1,32 @@
+# RUN: llvm-mc -triple=x86_64 -x86-align-for-macrofusion %s -filetype=obj | llvm-objdump --no-show-raw-insn -d - | FileCheck %s
+
+# No padding is expected since `test` doesn't end at an alignment boundary:
+# CHECK-NOT: nop
+  testl %eax, %eax
+# CHECK: testl %eax, %eax
+  je  .LBB0
+
+.nops 57
+  int3
+# BoundaryAlign followed by MCDataFragment:
+# inserts nop because `test` would end at alignment boundary:
+# CHECK: 			3e: nop
+  testl %eax, %eax
+# CHECK-NEXT: 3f: testl %eax, %eax
+  je  .LBB0
+# CHECK-NEXT: 41: je
+.LBB0:
+  retq
+
+.p2align 6
+.L0:
+.nops 57
+  int3
+# BoundaryAlign followed by RelaxableFragment (`cmp` would end at the boundary):
+# CHECK: 			ba: nop
+  cmpl $(.L1-.L0), %eax
+# CHECK-NEXT: bb: cmpl
+  je  .L0
+# CHECK-NEXT: c1: je
+.nops 65
+.L1: