diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -12,7 +12,9 @@
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -103,6 +105,14 @@
     "assumptions about labels corresponding to particular instructions, "
     "and should be used with caution."));
 
+cl::opt<bool> X86PadForAlign(
+    "x86-pad-for-align", cl::init(true), cl::Hidden,
+    cl::desc("Pad previous instructions to implement align directives"));
+
+cl::opt<bool> X86PadForBranchAlign(
+    "x86-pad-for-branch-align", cl::init(true), cl::Hidden,
+    cl::desc("Pad previous instructions to implement branch alignment"));
+
 class X86ELFObjectWriter : public MCELFObjectTargetWriter {
 public:
   X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
@@ -173,6 +183,10 @@
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override;
 
+  bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+                              unsigned &RemainingSize) const;
+
+  void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
+
   bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
 };
 } // end anonymous namespace
@@ -639,6 +653,168 @@
   Res.setOpcode(RelaxedOp);
 }
 
+static bool canBeRelaxedForPadding(const MCRelaxableFragment &RF) {
+  // TODO: There are lots of other tricks we could apply for increasing
+  // encoding size without impacting performance.
+  auto &Inst = RF.getInst();
+  auto &STI = *RF.getSubtargetInfo();
+  bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+  return getRelaxedOpcode(Inst, is16BitMode) != Inst.getOpcode();
+}
+
+bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+                                           MCCodeEmitter &Emitter,
+                                           unsigned &RemainingSize) const {
+  if (!canBeRelaxedForPadding(RF))
+    return false;
+
+  MCInst Relaxed;
+  relaxInstruction(RF.getInst(), *RF.getSubtargetInfo(), Relaxed);
+
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<15> Code;
+  raw_svector_ostream VecOS(Code);
+  Emitter.encodeInstruction(Relaxed, VecOS, Fixups, *RF.getSubtargetInfo());
+  const unsigned OldSize = RF.getContents().size();
+  const unsigned NewSize = Code.size();
+  assert(NewSize >= OldSize && "size decrease during relaxation?");
+  unsigned Delta = NewSize - OldSize;
+  if (Delta > RemainingSize)
+    return false;
+  RF.setInst(Relaxed);
+  RF.getContents() = Code;
+  RF.getFixups() = Fixups;
+  RemainingSize -= Delta;
+  return true;
+}
+
+void X86AsmBackend::finishLayout(MCAssembler const &Asm,
+                                 MCAsmLayout &Layout) const {
+  // See if we can further relax some instructions to cut down on the number of
+  // nop bytes required for code alignment.  The actual win is in reducing
+  // instruction count, not number of bytes.  Modern X86-64 can easily end up
+  // decode limited.  It is often better to reduce the number of instructions
+  // (i.e. eliminate nops) even at the cost of increasing the size and
+  // complexity of others.
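+  //
+  // In outline: walk each text section accumulating runs of relaxable
+  // fragments, and when an align (or boundary align) fragment is reached,
+  // grow the encodings of the instructions preceding it (e.g. a rel8 jump
+  // becomes a rel32 jump) until the padding that would otherwise be emitted
+  // as nops has been consumed.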
+  if (!X86PadForAlign && !X86PadForBranchAlign)
+    return;
+
+  DenseSet<MCFragment *> LabeledFragments;
+  for (const MCSymbol &S : Asm.symbols())
+    LabeledFragments.insert(S.getFragment(false));
+
+  for (MCSection &Sec : Asm) {
+    if (!Sec.getKind().isText())
+      continue;
+
+    SmallVector<MCRelaxableFragment *, 4> Relaxable;
+    for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
+      MCFragment &F = *I;
+
+      if (LabeledFragments.count(&F))
+        Relaxable.clear();
+
+      if (F.getKind() == MCFragment::FT_Data ||
+          F.getKind() == MCFragment::FT_CompactEncodedInst)
+        // Skip and ignore
+        continue;
+
+      if (F.getKind() == MCFragment::FT_Relaxable) {
+        auto &RF = cast<MCRelaxableFragment>(*I);
+        Relaxable.push_back(&RF);
+        continue;
+      }
+
+      auto canHandle = [](MCFragment &F) -> bool {
+        switch (F.getKind()) {
+        default:
+          return false;
+        case MCFragment::FT_Align:
+          return X86PadForAlign;
+        case MCFragment::FT_BoundaryAlign:
+          return X86PadForBranchAlign;
+        }
+      };
+      // For any unhandled kind, assume we can't change layout.
+      if (!canHandle(F)) {
+        Relaxable.clear();
+        continue;
+      }
+
+      const uint64_t OrigOffset = Layout.getFragmentOffset(&F);
+      const uint64_t OrigSize = Asm.computeFragmentSize(Layout, F);
+      if (OrigSize == 0 || Relaxable.empty()) {
+        Relaxable.clear();
+        continue;
+      }
+
+      // To keep the effects local, prefer to relax instructions closest to
+      // the align directive.  This is purely about human understandability
+      // of the resulting code.  If we later find a reason to expand
+      // particular instructions over others, we can adjust.
+      MCFragment *FirstChangedFragment = nullptr;
+      unsigned RemainingSize = OrigSize;
+      while (!Relaxable.empty() && RemainingSize != 0) {
+        auto &RF = *Relaxable.pop_back_val();
+        // Give the backend a chance to play any tricks it wishes to increase
+        // the encoding size of the given instruction.  Target independent code
+        // will try further relaxation, but targets may play further tricks.
+        if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize))
+          FirstChangedFragment = &RF;
+
+        // If we have an instruction which hasn't been fully relaxed, we can't
+        // skip past it and insert bytes before it.  Changing its starting
+        // offset might require a larger negative offset than it can encode.
+        // We don't need to worry about larger positive offsets as none of the
+        // possible offsets between this and our align are visible, and the
+        // ones afterwards aren't changing.
+        if (mayNeedRelaxation(RF.getInst(), *RF.getSubtargetInfo()))
+          break;
+      }
+      Relaxable.clear();
+
+      if (FirstChangedFragment) {
+        // Make sure the offsets for any fragments in the affected range get
+        // updated.  Note that this (conservatively) invalidates the offsets
+        // of everything that follows, which is more than strictly required.
+        Layout.invalidateFragmentsFrom(FirstChangedFragment);
+      }
+
+      // BoundaryAlign explicitly tracks its size (unlike align).
+      if (F.getKind() == MCFragment::FT_BoundaryAlign)
+        cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize);
+
+      const uint64_t FinalOffset = Layout.getFragmentOffset(&F);
+      const uint64_t FinalSize = Asm.computeFragmentSize(Layout, F);
+      assert(OrigOffset + OrigSize == FinalOffset + FinalSize &&
+             "can't move start of next fragment!");
+      assert(FinalSize == RemainingSize && "inconsistent size computation?");
+
+      // If we're looking at a boundary align, make sure we don't try to pad
+      // its target instructions for some following directive.  Doing so would
+      // break the alignment of the current boundary align.
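+      // Concretely: a branch which this boundary align just positioned must
+      // not be grown again to satisfy a later directive, so we skip past the
+      // fragment(s) containing it rather than adding them to Relaxable.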
+      if (F.getKind() == MCFragment::FT_BoundaryAlign) {
+        auto &BF = cast<MCBoundaryAlignFragment>(F);
+        const MCFragment *Next = BF.getNextNode();
+        // If the branch is unfused, it is emitted into one fragment; if it is
+        // fused, it is emitted into at most two fragments, and the next
+        // MCBoundaryAlignFragment (if one exists) also marks the end of the
+        // branch.
+        for (int i = 0, N = BF.isFused() ? 2 : 1;
+             i != N && !isa<MCBoundaryAlignFragment>(Next);
+             ++i, Next = Next->getNextNode(), I++) {
+        }
+      }
+    }
+  }
+
+  // The layout is done.  Mark every fragment as valid.
+  for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
+    MCSection &Section = *Layout.getSectionOrder()[i];
+    Layout.getFragmentOffset(&*Section.getFragmentList().rbegin());
+    Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin());
+  }
+}
+
 /// Write a sequence of optimal nops to the output, covering \p Count
 /// bytes.
 /// \return - true on success, false on failure
diff --git a/llvm/test/MC/X86/align-branch-64.s b/llvm/test/MC/X86/align-branch-64.s
--- a/llvm/test/MC/X86/align-branch-64.s
+++ b/llvm/test/MC/X86/align-branch-64.s
@@ -103,6 +103,59 @@
 bar:
   retq
 
+  # CHECK: test_pad_via_relax:
+  # CHECK: 200: testq
+  # CHECK: 203: jne
+  # CHECK: 209: int3
+  # Note the 6-byte jne, which could have been encoded as a 2-byte jne but is
+  # instead expanded for padding purposes.
+  # CHECK-NOT: nop
+  # CHECK: 220: callq
+  .global test_pad_via_relax
+  .p2align 5
+test_pad_via_relax:
+  testq %rax, %rax
+  jnz bar
+  .rept 23
+  int3
+  .endr
+  callq bar
+
+  # This case looks really tempting to pad, but doing so for the call causes
+  # the jmp to be misaligned.
+  # CHECK: test_pad_via_relax_neg1:
+  # CHECK: 240: int3
+  # CHECK: 25a: testq
+  # CHECK: 25d: jne
+  # CHECK: 25f: nop
+  # CHECK: 260: callq
+  .global test_pad_via_relax_neg1
+  .p2align 5
+test_pad_via_relax_neg1:
+  .rept 26
+  int3
+  .endr
+  testq %rax, %rax
+  jnz bar
+  callq bar
+
+  # Same as the previous case, but without fusion.
+  # CHECK: test_pad_via_relax_neg2:
+  # CHECK: 280: int3
+  # CHECK: 29d: jmp
+  # CHECK: 29f: nop
+  # CHECK: 2a0: callq
+  .global test_pad_via_relax_neg2
+  .p2align 5
+test_pad_via_relax_neg2:
+  .rept 29
+  int3
+  .endr
+  jmp bar2
+  callq bar2
+
+bar2:
+
 .section "unknown"
 .p2align 4
 .type baz,@function
diff --git a/llvm/test/MC/X86/align-via-relaxation.s b/llvm/test/MC/X86/align-via-relaxation.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/X86/align-via-relaxation.s
@@ -0,0 +1,74 @@
+# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s
+
+  .file "test.c"
+  .text
+  .section .text
+  # Demonstrate that we can relax instructions to provide padding, not
+  # just insert nops.  jmps are being used for ease of demonstration.
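+  # With seven 2-byte jmps this block would end at offset 0xe, requiring 18
+  # bytes of nops before the .p2align 5 boundary; instead, the six jmps
+  # closest to the directive are relaxed to their 5-byte rel32 form
+  # (2 + 6*5 = 32 bytes), so no nops are needed at all.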
+  # CHECK: .text
+  # CHECK: 0: eb 1f jmp 31
+  # CHECK: 2: e9 1a 00 00 00 jmp 26
+  # CHECK: 7: e9 15 00 00 00 jmp 21
+  # CHECK: c: e9 10 00 00 00 jmp 16
+  # CHECK: 11: e9 0b 00 00 00 jmp 11
+  # CHECK: 16: e9 06 00 00 00 jmp 6
+  # CHECK: 1b: e9 01 00 00 00 jmp 1
+  # CHECK: 20: cc int3
+  .p2align 4
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  .p2align 5
+  int3
+foo:
+  ret
+
+  # Check that we're not shifting around the offsets of labels - doing
+  # so would require a further round of relaxation.
+  # CHECK: bar:
+  # CHECK: 22: eb fe jmp -2
+  # CHECK: 24: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
+  # CHECK: 2e: 66 90 nop
+  # CHECK: 30: 0f 0b ud2
+bar:
+  jmp bar
+nobypass:
+  .p2align 4
+  ud2
+
+  # Canonical toy loop to show the benefit - we can align the loop header
+  # with fewer nops by relaxing the branch, even though we don't need to
+  # for correctness.
+  # CHECK: loop_preheader:
+  # CHECK: 45: 48 85 c0 testq %rax, %rax
+  # CHECK: 48: 0f 8e 22 00 00 00 jle 34
+  # CHECK: 4e: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
+  # CHECK: 58: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax)
+  # CHECK: loop_header:
+  # CHECK: 60: 48 83 e8 01 subq $1, %rax
+  # CHECK: 64: 48 85 c0 testq %rax, %rax
+  # CHECK: 67: 7e 07 jle 7
+  # CHECK: 69: e9 f2 ff ff ff jmp -14
+  # CHECK: 6e: 66 90 nop
+  # CHECK: loop_exit:
+  # CHECK: 70: c3 retq
+  .p2align 5
+  .skip 5
+loop_preheader:
+  testq %rax, %rax
+  jle loop_exit
+  .p2align 5
+loop_header:
+  subq $1, %rax
+  testq %rax, %rax
+  jle loop_exit
+  jmp loop_header
+  .p2align 4
+loop_exit:
+  ret
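
Note: the two options added above are ordinary (hidden) cl::opt flags, so they
can be toggled directly when experimenting. A minimal sketch, assuming a build
of llvm-mc that includes this patch, mirroring the RUN line of the new test:

  llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu \
      -x86-pad-for-align=0 -x86-pad-for-branch-align=0 \
      llvm/test/MC/X86/align-via-relaxation.s \
    | llvm-objdump -d --section=.text -

With both flags set to 0, the output should match the pre-patch behavior
(alignment implemented purely with nops); with the defaults (both true), the
relaxed encodings shown in the CHECK lines above are produced instead.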