Index: llvm/include/llvm/MC/MCFragment.h
===================================================================
--- llvm/include/llvm/MC/MCFragment.h
+++ llvm/include/llvm/MC/MCFragment.h
@@ -519,35 +519,34 @@
 
 /// Represents required padding such that a particular other set of fragments
 /// does not cross a particular power-of-two boundary. The other fragments must
-/// follow this one within the same section.
+/// follow this one within the same section.  The set of aligned fragments is
+/// (ThisFragment, LastFragment].  That is, the range starting at the fragment
+/// following this one and stopping at LastFragment (inclusive).
 class MCBoundaryAlignFragment : public MCFragment {
-  /// The alignment requirement of the branch to be aligned.
-  Align AlignBoundary;
-  /// Flag to indicate whether the branch is fused.  Use in determining the
-  /// region of fragments being aligned.
-  bool Fused : 1;
-  /// Flag to indicate whether NOPs should be emitted.
-  bool EmitNops : 1;
+  /// The last fragment in the range of fragments being aligned.  The range of
+  /// instructions being aligned is defined as (this, LastFragment].
+  MCFragment *LastFragment = nullptr;
   /// The size of the fragment.  The size is lazily set during relaxation, and
   /// is not meaningful before that.
-  uint64_t Size = 0;
+  uint16_t Size = 0;
+  /// The alignment requirement of the branch to be aligned.
+  Align AlignBoundary;
 
 public:
-  MCBoundaryAlignFragment(Align AlignBoundary, bool Fused = false,
-                          bool EmitNops = false, MCSection *Sec = nullptr)
-      : MCFragment(FT_BoundaryAlign, false, Sec), AlignBoundary(AlignBoundary),
-        Fused(Fused), EmitNops(EmitNops) {}
+  MCBoundaryAlignFragment(Align AlignBoundary, MCSection *Sec = nullptr)
+    : MCFragment(FT_BoundaryAlign, false, Sec), AlignBoundary(AlignBoundary)
+  {}
 
   uint64_t getSize() const { return Size; }
-  void setSize(uint64_t Value) { Size = Value; }
+  void setSize(uint64_t Value) {
+    assert(Value == (uint64_t)((uint16_t)Value));
+    Size = Value;
+  }
 
   Align getAlignment() const { return AlignBoundary; }
 
-  bool isFused() const { return Fused; }
-  void setFused(bool Value) { Fused = Value; }
-
-  bool canEmitNops() const { return EmitNops; }
-  void setEmitNops(bool Value) { EmitNops = Value; }
+  MCFragment *getLastFragment() const { return LastFragment; }
+  void setLastFragment(MCFragment *F) { LastFragment = F; }
 
   static bool classof(const MCFragment *F) {
     return F->getKind() == MCFragment::FT_BoundaryAlign;
Index: llvm/lib/MC/MCAssembler.cpp
===================================================================
--- llvm/lib/MC/MCAssembler.cpp
+++ llvm/lib/MC/MCAssembler.cpp
@@ -986,20 +986,22 @@
 
 bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
                                      MCBoundaryAlignFragment &BF) {
-  // The MCBoundaryAlignFragment that doesn't emit NOP should not be relaxed.
-  if (!BF.canEmitNops())
+  // If the last fragment was never set, then there's nothing to align.
+  if (!BF.getLastFragment())
     return false;
 
   uint64_t AlignedOffset = Layout.getFragmentOffset(BF.getNextNode());
+  // Compute the size of all the instructions in the range we're trying to
+  // align.  The range is [BF.getNextNode, LastNode].  
   uint64_t AlignedSize = 0;
-  const MCFragment *F = BF.getNextNode();
-  // If the branch is unfused, it is emitted into one fragment, otherwise it is
-  // emitted into two fragments at most, the next MCBoundaryAlignFragment(if
-  // exists) also marks the end of the branch.
-  for (auto i = 0, N = BF.isFused() ? 2 : 1;
-       i != N && !isa<MCBoundaryAlignFragment>(F); ++i, F = F->getNextNode()) {
+  for (const MCFragment *F = BF.getNextNode(), *Last = BF.getLastFragment(); ;
+       F = F->getNextNode()) {
     AlignedSize += computeFragmentSize(Layout, *F);
+    if (F == Last)
+      break;
   }
+  assert(AlignedSize <= BF.getAlignment().value() &&
+         "can't align a region larger than the alignment");
   uint64_t OldSize = BF.getSize();
   AlignedOffset -= OldSize;
   Align BoundaryAlignment = BF.getAlignment();
Index: llvm/lib/MC/MCFragment.cpp
===================================================================
--- llvm/lib/MC/MCFragment.cpp
+++ llvm/lib/MC/MCFragment.cpp
@@ -424,15 +424,9 @@
   }
   case MCFragment::FT_BoundaryAlign: {
     const auto *BF = cast<MCBoundaryAlignFragment>(this);
-    if (BF->canEmitNops())
-      OS << " (can emit nops to align";
-    if (BF->isFused())
-      OS << " fused branch)";
-    else
-      OS << " unfused branch)";
-    OS << "\n       ";
     OS << " BoundarySize:" << BF->getAlignment().value()
-       << " Size:" << BF->getSize();
+       << " Size:" << BF->getSize()
+       << " LastFragment:" << BF->getLastFragment();
     break;
   }
   case MCFragment::FT_SymbolId: {
Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
===================================================================
--- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -152,9 +152,9 @@
 
   bool needAlign(MCObjectStreamer &OS) const;
   bool needAlignInst(const MCInst &Inst) const;
-  MCBoundaryAlignFragment *
-  getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const;
+
   MCInst PrevInst;
+  MCBoundaryAlignFragment *PendingBA = nullptr;
 
 public:
   X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
@@ -458,57 +458,31 @@
           (AlignBranchType & X86AlignBranchKind::AlignBranchIndirect));
 }
 
-static bool canReuseBoundaryAlignFragment(const MCBoundaryAlignFragment &F) {
-  // If a MCBoundaryAlignFragment has not been used to emit NOP,we can reuse it.
-  return !F.canEmitNops();
-}
-
-MCBoundaryAlignFragment *
-X86AsmBackend::getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const {
-  auto *F = dyn_cast_or_null<MCBoundaryAlignFragment>(OS.getCurrentFragment());
-  if (!F || !canReuseBoundaryAlignFragment(*F)) {
-    F = new MCBoundaryAlignFragment(AlignBoundary);
-    OS.insert(F);
-  }
-  return F;
-}
-
 /// Insert MCBoundaryAlignFragment before instructions to align branches.
 void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS,
                                        const MCInst &Inst) {
   if (!needAlign(OS))
     return;
 
-  MCFragment *CF = OS.getCurrentFragment();
+  // Summary of pending BA scheme
+  // - If we encounter the first instruction in a fusible pair, insert a BA
+  //   fragment.
+  // - If we emit anything which isn't part of a fusible pair, clear pending.
+  // - If we ecounter an jmp/branch/etc, insert a BA fragment if there isn't
+  //   already one pending for a valid fusible sequence.
+  // - Close the pending sequence after any alignable instruction (since they
+  //   also terminate fusible sequences.)
+  // Note that the BoundaryAlign fragment only has semantic meaning once a
+  // valid LastFragment is set.  Otherwise, it remains a nop.
+
   bool NeedAlignFused = AlignBranchType & X86AlignBranchKind::AlignBranchFused;
-  if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) {
-    // Macro fusion actually happens and there is no other fragment inserted
-    // after the previous instruction. NOP can be emitted in PF to align fused
-    // jcc.
-    if (auto *PF =
-            dyn_cast_or_null<MCBoundaryAlignFragment>(CF->getPrevNode())) {
-      const_cast<MCBoundaryAlignFragment *>(PF)->setEmitNops(true);
-      const_cast<MCBoundaryAlignFragment *>(PF)->setFused(true);
-    }
-  } else if (needAlignInst(Inst)) {
-    // Note: When there is at least one fragment, such as MCAlignFragment,
-    // inserted after the previous instruction, e.g.
-    //
-    // \code
-    //   cmp %rax %rcx
-    //   .align 16
-    //   je .Label0
-    // \ endcode
-    //
-    // We will treat the JCC as a unfused branch although it may be fused
-    // with the CMP.
-    auto *F = getOrCreateBoundaryAlignFragment(OS);
-    F->setEmitNops(true);
-    F->setFused(false);
-  } else if (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII)) {
-    // We don't know if macro fusion happens until the reaching the next
-    // instruction, so a place holder is put here if necessary.
-    getOrCreateBoundaryAlignFragment(OS);
+  if (!needAlignInst(Inst) && PendingBA)
+    PendingBA = nullptr;
+  if ((needAlignInst(Inst) && (!PendingBA ||
+                               !isMacroFused(PrevInst, Inst))) ||
+      (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII))) {
+    PendingBA = new MCBoundaryAlignFragment(AlignBoundary);
+    OS.insert(PendingBA);
   }
 
   PrevInst = Inst;
@@ -517,20 +491,28 @@
 /// Insert a MCBoundaryAlignFragment to mark the end of the branch to be aligned
 /// if necessary.
 void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) {
-  if (!needAlign(OS))
+  if (!needAlign(OS) || !needAlignInst(Inst))
+    return;
+
+  auto *BA = PendingBA;
+  if (!BA)
+    return;
+  PendingBA = nullptr;
+
+  auto *CF = OS.getCurrentFragment();
+  if (!CF)
     return;
-  // If the branch is emitted into a MCRelaxableFragment, we can determine the
-  // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the
-  // branch is fused, the fused branch(macro fusion pair) must be emitted into
-  // two fragments. Or when the branch is unfused, the branch must be emitted
-  // into one fragment. The MCRelaxableFragment naturally marks the end of the
-  // fused or unfused branch.
-  // Otherwise, we need to insert a MCBoundaryAlignFragment to mark the end of
-  // the branch. This MCBoundaryAlignFragment may be reused to emit NOP to align
-  // other branch.
-  if (needAlignInst(Inst) && !isa<MCRelaxableFragment>(OS.getCurrentFragment()))
-    OS.insert(new MCBoundaryAlignFragment(AlignBoundary));
 
+  // Define the range of fragments being aligned.
+  BA->setLastFragment(CF);
+  
+  // We need to ensure that further instructions aren't added to the current
+  // fragment.  The easiest way is to insert a new empty DataFragment.
+  // RelaxableFragments are always single instructions, and are common for
+  // jumps, so avoid inserting a potentially unused fragment in that case.
+  if (!isa<MCRelaxableFragment>(CF))
+    OS.insert(new MCDataFragment());
+  
   // Update the maximum alignment on the current section if necessary.
   MCSection *Sec = OS.getCurrentSectionOnly();
   if (AlignBoundary.value() > Sec->getAlignment())
Index: llvm/test/MC/X86/align-branch-64-negative.s
===================================================================
--- llvm/test/MC/X86/align-branch-64-negative.s
+++ llvm/test/MC/X86/align-branch-64-negative.s
@@ -4,13 +4,13 @@
   # in the assembler.  These are the examples brought up in review and
   # discussion which motivate the need for a assembler directive to
   # selectively enable/disable.
-  # FIXME: the checks are checking current *incorrect* behavior
+  # FIXME: (some of) the checks are checking current *incorrect* behavior
 
   .text
 
   # In the first test, we have a label which is expected to be bound to the
   # start of the call.  For instance, we want to associate a fault on the call
-  # target with some bit of higher level sementic.
+  # target with some bit of higher level sementic. (FIXME)
   # CHECK: labeled_call_test1:
   # CHECK: 1f label_before
   # CHECK: 1f: nop
@@ -29,8 +29,8 @@
   # the return address of the call.
   # CHECK: labeled_call_test2:
   # CHECK: 5a: callq
+  # CHECK: 5f label_after
   # CHECK: 5f: nop
-  # CHECK: 60 label_after
   # CHECK: 60: jmp
   .globl  labeled_call_test2
   .p2align  5
@@ -44,7 +44,7 @@
 
   # Our third test is like the first w/a labeled fault, but specifically to
   # a fused memory comparison.  This is the form produced by implicit null
-  # checks for instance.
+  # checks for instance. (FIXME)
   # CHECK: implicit_null_check:
   # CHECK: 9f fault_addr
   # CHECK: 9f: nop