Index: llvm/include/llvm/MC/MCAsmBackend.h
===================================================================
--- llvm/include/llvm/MC/MCAsmBackend.h
+++ llvm/include/llvm/MC/MCAsmBackend.h
@@ -46,6 +46,10 @@
 
   const support::endianness Endian;
 
+  /// Return true if this target might automatically pad instructions and thus
+  /// need to emit padding enable/disable directives around sensitive code.
+  virtual bool allowAutoPadding() const { return false; }
+
   /// Give the target a chance to manipulate state related to instruction
   /// alignment (e.g. padding for optimization) before and after actually
   /// emitting the instruction.
Index: llvm/include/llvm/MC/MCStreamer.h
===================================================================
--- llvm/include/llvm/MC/MCStreamer.h
+++ llvm/include/llvm/MC/MCStreamer.h
@@ -222,6 +222,13 @@
 
   bool UseAssemblerInfoForParsing;
 
+  /// Is the assembler allowed to insert padding automatically? For
+  /// correctness reasons, we sometimes need to ensure instructions aren't
+  /// separated in unexpected ways. At the moment, this feature is only
+  /// usable from an integrated assembler, but assembly syntax is under
+  /// discussion for future inclusion.
+  bool AllowAutoPadding = false;
+
 protected:
   MCStreamer(MCContext &Ctx);
 
@@ -266,6 +273,9 @@
     return TargetStreamer.get();
   }
 
+  void setAllowAutoPadding(bool v) { AllowAutoPadding = v; }
+  bool getAllowAutoPadding() const { return AllowAutoPadding; }
+
   /// When emitting an object file, create and emit a real label. When emitting
   /// textual assembly, this should do nothing to avoid polluting our output.
   virtual MCSymbol *EmitCFILabel();
Index: llvm/lib/MC/MCAsmStreamer.cpp
===================================================================
--- llvm/lib/MC/MCAsmStreamer.cpp
+++ llvm/lib/MC/MCAsmStreamer.cpp
@@ -77,6 +77,7 @@
     assert(InstPrinter);
     if (IsVerboseAsm)
         InstPrinter->setCommentStream(CommentStream);
+    setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
   }
 
   MCAssembler &getAssembler() { return *Assembler; }
Index: llvm/lib/MC/MCObjectStreamer.cpp
===================================================================
--- llvm/lib/MC/MCObjectStreamer.cpp
+++ llvm/lib/MC/MCObjectStreamer.cpp
@@ -29,7 +29,9 @@
     : MCStreamer(Context),
       Assembler(std::make_unique<MCAssembler>(
           Context, std::move(TAB), std::move(Emitter), std::move(OW))),
-      EmitEHFrame(true), EmitDebugFrame(false) {}
+      EmitEHFrame(true), EmitDebugFrame(false) {
+  setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
+}
 
 MCObjectStreamer::~MCObjectStreamer() {}
Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
===================================================================
--- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -155,6 +155,7 @@
     AlignBranchType = X86AlignBranchKindLoc;
   }
 
+  bool allowAutoPadding() const override;
   void alignBranchesBegin(MCObjectStreamer &OS, const MCInst &Inst) override;
   void alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) override;
 
@@ -410,10 +411,15 @@
   return false;
 }
 
+bool X86AsmBackend::allowAutoPadding() const {
+  return (AlignBoundary != Align::None() &&
+          AlignBranchType != X86::AlignBranchNone);
+}
+
 bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const {
-  if (AlignBoundary == Align::None() ||
-      AlignBranchType == X86::AlignBranchNone)
+  if (!OS.getAllowAutoPadding())
     return false;
+  assert(allowAutoPadding() && "incorrect initialization!");
 
   MCAssembler &Assembler = OS.getAssembler();
   MCSection *Sec = OS.getCurrentSectionOnly();
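
// A minimal sketch (not part of the patch) of how the new hook is intended to
// flow: a streamer seeds its AllowAutoPadding flag from
// MCAsmBackend::allowAutoPadding() at construction, and a client that must
// keep an instruction at a stable offset can clear the flag around the
// emission. Only the MCAsmBackend/MCStreamer methods introduced above are
// real; emitWithoutAutoPadding is a hypothetical helper for illustration.

#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"

static void emitWithoutAutoPadding(llvm::MCStreamer &OS,
                                   const llvm::MCInst &Inst,
                                   const llvm::MCSubtargetInfo &STI) {
  const bool Old = OS.getAllowAutoPadding(); // seeded from the backend
  OS.setAllowAutoPadding(false);             // correctness-sensitive region
  OS.EmitInstruction(Inst, STI);             // must not gain padding around it
  OS.setAllowAutoPadding(Old);               // restore the previous policy
}
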
Index: llvm/lib/Target/X86/X86MCInstLower.cpp
===================================================================
--- llvm/lib/Target/X86/X86MCInstLower.cpp
+++ llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1142,10 +1142,37 @@
   }
 }
 
+/// A RAII helper which defines a region of instructions which can't have
+/// padding added between them for correctness.
+struct NoAutoPaddingScope {
+  MCStreamer &OS;
+  bool OldAllowAutoPadding;
+  NoAutoPaddingScope(MCStreamer &OS) : OS(OS) {
+    OldAllowAutoPadding = OS.getAllowAutoPadding();
+    changeAndComment(false);
+  }
+  ~NoAutoPaddingScope() {
+    changeAndComment(OldAllowAutoPadding);
+  }
+
+  void changeAndComment(bool b) {
+    if (b == OS.getAllowAutoPadding())
+      return;
+    OS.setAllowAutoPadding(b);
+    if (b)
+      OS.emitRawComment("autopadding");
+    else
+      OS.emitRawComment("noautopadding");
+  }
+};
+
 void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
                                     X86MCInstLower &MCIL) {
   assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   StatepointOpers SOpers(&MI);
   if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
     EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
@@ -1207,6 +1234,8 @@
   // FAULTING_LOAD_OP <def>, <fault kind>, <MBB handler>,
   //                  <opcode>, <operands>
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   Register DefRegister = FaultingMI.getOperand(0).getReg();
   FaultMaps::FaultKind FK =
       static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
@@ -1253,6 +1282,8 @@
 void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
                                       X86MCInstLower &MCIL) {
   // PATCHABLE_OP minsize, opcode, operands
+
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
 
   unsigned MinSize = MI.getOperand(0).getImm();
   unsigned Opcode = MI.getOperand(1).getImm();
@@ -1291,7 +1322,7 @@
 // <id>, <shadowBytes>, ...
 void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
   SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
-
+
   auto &Ctx = OutStreamer->getContext();
   MCSymbol *MILabel = Ctx.createTempSymbol();
   OutStreamer->EmitLabel(MILabel);
@@ -1309,6 +1340,8 @@
 
   SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   auto &Ctx = OutStreamer->getContext();
   MCSymbol *MILabel = Ctx.createTempSymbol();
   OutStreamer->EmitLabel(MILabel);
@@ -1368,6 +1401,8 @@
                                           X86MCInstLower &MCIL) {
   assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64");
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // We want to emit the following pattern, which follows the x86 calling
   // convention to prepare for the trampoline call to be patched in.
   //
@@ -1462,6 +1497,8 @@
                                           X86MCInstLower &MCIL) {
   assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64");
 
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // We want to emit the following pattern, which follows the x86 calling
   // convention to prepare for the trampoline call to be patched in.
   //
@@ -1559,6 +1596,9 @@
 
 void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
                                                   X86MCInstLower &MCIL) {
+
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // We want to emit the following pattern:
   //
   //   .p2align 1, ...
@@ -1586,6 +1626,8 @@
 
 void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
                                        X86MCInstLower &MCIL) {
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // Since PATCHABLE_RET takes the opcode of the return statement as an
   // argument, we use that to emit the correct form of the RET that we want.
   // i.e. when we see this:
@@ -1616,6 +1658,8 @@
 
 void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
                                              X86MCInstLower &MCIL) {
+  NoAutoPaddingScope NoPadScope(*OutStreamer);
+
   // Like PATCHABLE_RET, we have the actual instruction in the operands to this
   // instruction so we lower that particular instruction and its operands.
   // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
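
// A minimal sketch (not part of the patch) of the usage pattern the
// NoAutoPaddingScope RAII helper is designed for. LowerMY_PSEUDO is a
// hypothetical lowering routine; the scope keeps the temporary label bound to
// the instruction it describes, and in textual assembly the transitions show
// up as the "#noautopadding" / "#autopadding" comments the tests below check.

static void LowerMY_PSEUDO(llvm::MCStreamer &OS, const llvm::MCInst &Inst,
                           const llvm::MCSubtargetInfo &STI) {
  NoAutoPaddingScope NoPadScope(OS); // disable padding; emits "#noautopadding"

  llvm::MCSymbol *MILabel = OS.getContext().createTempSymbol();
  OS.EmitLabel(MILabel);             // label must stay glued to the next inst
  OS.EmitInstruction(Inst, STI);     // emitted with automatic padding disabled
}                                    // destructor restores the caller's policy
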
Index: llvm/test/CodeGen/X86/align-branch-boundary-noautopadding.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/align-branch-boundary-noautopadding.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -O3 -mcpu=skylake -x86-align-branch-boundary=32 -x86-align-branch=call -filetype=obj < %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
+
+;; This file is a companion to align-branch-boundary-suppressions.ll.
+;; It exists to demonstrate that suppressions are actually wired into the
+;; integrated assembler.
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+define void @test_statepoint(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK: 1f: callq
+entry:
+  ; Each of these will be 5 bytes, pushing the statepoint to offset=30.
+  ; For a normal call, this would force padding between the last normal
+  ; call and the safepoint, but since we've suppressed alignment that won't
+  ; happen for the safepoint. That's non-ideal, we'd really prefer to do
+  ; the alignment and just keep the label with the statepoint call. (TODO)
+  call void @foo()
+  call void @foo()
+  call void @foo()
+  call void @foo()
+  call void @foo()
+  call void @foo()
+  call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+declare void @foo()
+declare zeroext i1 @return_i1()
+declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...)
Index: llvm/test/CodeGen/X86/align-branch-boundary-suppressions.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/align-branch-boundary-suppressions.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -O3 -enable-implicit-null-checks -mcpu=skylake -x86-align-branch-boundary=32 -x86-align-branch=call+jmp+indirect+ret+jcc < %s | FileCheck %s
+
+;; The tests in this file check that various constructs which need to disable
+;; prefix and/or nop padding do so in the right places. However, since we
+;; don't yet have assembler syntax for this, they're only able to check
+;; comments and must hope the assembler does the right thing.
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; If we have autopadding enabled, make sure the label isn't separated from
+; the mov.
+define i32 @implicit_null_check(i32* %x) {
+; CHECK-LABEL: implicit_null_check:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    #noautopadding
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    movl (%rdi), %eax # on-fault: .LBB0_1
+; CHECK-NEXT:    #autopadding
+; CHECK-NEXT:  # %bb.2: # %not_null
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_1: # %is_null
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    retq
+
+ entry:
+  %c = icmp eq i32* %x, null
+  br i1 %c, label %is_null, label %not_null, !make.implicit !{}
+
+ is_null:
+  ret i32 42
+
+ not_null:
+  %t = load atomic i32, i32* %x unordered, align 4
+  ret i32 %t
+}
+
+; Label must bind to call before
+define void @test_statepoint(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test_statepoint:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    #noautopadding
+; CHECK-NEXT:    callq return_i1
+; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:    #autopadding
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+declare zeroext i1 @return_i1()
+declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...)
+
+
+; Label must bind to following nop sequence
+define void @patchpoint(i64 %a, i64 %b) {
+; CHECK-LABEL: patchpoint:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    #noautopadding
+; CHECK-NEXT:  .Ltmp2:
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    nopw %cs:512(%rax,%rax)
+; CHECK-NEXT:    #autopadding
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    retq
+entry:
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 15, i8* null, i32 0, i64 %a, i64 %b)
+  ret void
+}
+
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)