diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -373,6 +373,27 @@
   return true;
 }
 
+/// Return true if \p Inst has an interrupt delay slot, i.e. it is one of the
+/// X86 instructions after which interrupts are only enabled one instruction
+/// later.  This covers STI, POPSS16/POPSS32, and the MOV forms that write SS.
+static bool hasInterruptDelaySlot(const MCInst &Inst) {
+  const unsigned Opcode = Inst.getOpcode();
+
+  // These opcodes have the delay slot regardless of their operands.
+  if (Opcode == X86::STI || Opcode == X86::POPSS16 || Opcode == X86::POPSS32)
+    return true;
+
+  // Moves into a segment register only qualify when that register is SS.
+  const bool IsMoveToSegReg = Opcode == X86::MOV16sr ||
+                              Opcode == X86::MOV32sr ||
+                              Opcode == X86::MOV64sr || Opcode == X86::MOV16sm;
+  if (!IsMoveToSegReg)
+    return false;
+
+  // Operand 0 is compared against SS (presumably the destination register).
+  return Inst.getOperand(0).getReg() == X86::SS;
+}
+
 /// Check if the instruction operand needs to be aligned. Padding is disabled
 /// before intruction which may be rewritten by linker(e.g. TLSCALL).
 bool X86AsmBackend::needAlignInst(const MCInst &Inst) const {
@@ -401,7 +422,10 @@
 
   MCFragment *CF = OS.getCurrentFragment();
   bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused;
-  if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) {
+  if (hasInterruptDelaySlot(PrevInst)) {
+    // If this instruction follows an interrupt enabling instruction with a one
+    // instruction delay, inserting a nop would change behavior.
+  } else if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) {
     // Macro fusion actually happens and there is no other fragment inserted
     // after the previous instruction. NOP can be emitted in PF to align fused
     // jcc.
diff --git a/llvm/test/MC/X86/align-branch-64-system.s b/llvm/test/MC/X86/align-branch-64-system.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/X86/align-branch-64-system.s
@@ -0,0 +1,68 @@
+  # RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu --x86-align-branch-boundary=32 --x86-align-branch=jmp %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
+
+  # Exercise instructions that enable interrupts with a one-instruction delay
+  # (sti, moves to %ss): a nop before the following jmp would change behavior.
+
+  .text
+
+  # CHECK: 1e:       sti
+  # CHECK: 1f:       jmp
+  .p2align  5
+  .rept 30
+  int3
+  .endr
+  sti
+  jmp baz
+
+  # CHECK: 5c:       movq %rax, %ss
+  # CHECK: 5f:       jmp
+  .p2align  5
+  .rept 28
+  int3
+  .endr
+  movq %rax, %ss
+  jmp baz
+
+  # CHECK: 9d:       movl %esi, %ss
+  # CHECK: 9f:       jmp
+  .p2align  5
+  .rept 29
+  int3
+  .endr
+  movl %esi, %ss
+  jmp baz
+
+  # movw and movl are interchangeable here since only the low 16 bits are used.
+  # Both are encoded as "MOV Sreg,r/m16**" (8E /r) and disassemble as movl.
+  # CHECK: dd:       movl %esi, %ss
+  # CHECK: df:       jmp
+  .p2align  5
+  .rept 29
+  int3
+  .endr
+  movw %si, %ss
+  jmp baz
+
+  # CHECK: 11b:       movw (%esi), %ss
+  # CHECK: 11e:       jmp
+  .p2align  5
+  .rept 27
+  int3
+  .endr
+  movw (%esi), %ss
+  jmp baz
+
+  # CHECK: 15b:      	movw	(%rsi), %ss
+  # CHECK: 15d:     	jmp
+  .p2align  5
+  .rept 27
+  int3
+  .endr
+  movw (%rsi), %ss
+  jmp baz
+
+
+  int3
+  .section ".text.other"
+bar:
+  retq