Index: llvm/include/llvm/MC/MCAsmBackend.h
===================================================================
--- llvm/include/llvm/MC/MCAsmBackend.h
+++ llvm/include/llvm/MC/MCAsmBackend.h
@@ -23,6 +23,7 @@
 class MCAsmLayout;
 class MCAssembler;
 class MCCFIInstruction;
+class MCCodeEmitter;
 struct MCFixupKindInfo;
 class MCFragment;
 class MCInst;
@@ -168,6 +169,14 @@
 
   /// @}
 
+  /// Expand the instruction encoding of RF by up to RemainingSize bytes.  Used
+  /// as an alternative to nop padding for alignment; the default does nothing.
+  virtual bool padInstructionEncoding(MCRelaxableFragment &RF,
+                                      MCCodeEmitter &Emitter,
+                                      unsigned &RemainingSize) {
+    return false; // RF left untouched; RemainingSize is not consumed.
+  }
+
   /// Returns the minimum size of a nop in bytes on this target. The assembler
   /// will use this to emit excess padding in situations where the padding
   /// required for simple alignment would be less than the minimum nop size.
Index: llvm/include/llvm/MC/MCAssembler.h
===================================================================
--- llvm/include/llvm/MC/MCAssembler.h
+++ llvm/include/llvm/MC/MCAssembler.h
@@ -203,6 +203,10 @@
                               MCCVInlineLineTableFragment &DF);
   bool relaxCVDefRange(MCAsmLayout &Layout, MCCVDefRangeFragment &DF);
 
+  /// Once relaxation is complete, try to reduce the number of nops required
+  /// for code alignment, without requiring any further relaxation.
+  void optimizeLayout(MCAsmLayout &Layout);
+
   /// finishLayout - Finalize a layout, including fragment lowering.
   void finishLayout(MCAsmLayout &Layout);
 
Index: llvm/lib/MC/MCAssembler.cpp
===================================================================
--- llvm/lib/MC/MCAssembler.cpp
+++ llvm/lib/MC/MCAssembler.cpp
@@ -791,6 +791,12 @@
       errs() << "assembler backend - post-relaxation\n--\n";
       dump(); });
 
+  optimizeLayout(Layout);
+
+  DEBUG_WITH_TYPE("mc-dump", {
+      errs() << "assembler backend - post-optimization\n--\n";
+      dump(); });
+
   // Finalize the layout, including fragment lowering.
   finishLayout(Layout);
 
@@ -1019,6 +1025,7 @@
   BF.setSize(NewSize);
   Layout.invalidateFragmentsFrom(&BF);
   return true;
+
 }
 
 bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout,
@@ -1164,6 +1171,108 @@
   return WasRelaxed;
 }
 
+void MCAssembler::optimizeLayout(MCAsmLayout &Layout) {
+  // See if we can further relax some instructions to cut down on the number of
+  // nop bytes required for code alignment.  The actual win is in reducing
+  // instruction count, not byte count: some micro-architectures (e.g. modern
+  // X86-64) easily end up decode limited, so eliminating nops is often better
+  // even at the cost of increasing the size and complexity of other encodings.
+
+  DenseSet<MCFragment *> LabeledFragments;
+  for (const MCSymbol &S : symbols())
+    LabeledFragments.insert(S.getFragment(false));
+
+  for (iterator it = begin(), ie = end(); it != ie; ++it) {
+    MCSection &Sec = *it;
+    if (!Sec.getKind().isText())
+      continue;
+
+    SmallVector<MCRelaxableFragment*, 4> Relaxable;
+    for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
+      MCFragment &F = *I;
+
+      if (LabeledFragments.count(&F))
+        Relaxable.clear();
+
+      if (F.getKind() == MCFragment::FT_Data)
+        // Skip and ignore
+        continue;
+
+      if (F.getKind() == MCFragment::FT_Relaxable) {
+        auto &RF = cast<MCRelaxableFragment>(*I);
+        Relaxable.push_back(&RF);
+        continue;
+      }
+
+      // For any unhandled kind, assume we can't change layout.
+      if (F.getKind() != MCFragment::FT_Align &&
+          F.getKind() != MCFragment::FT_BoundaryAlign) {
+        Relaxable.clear();
+        continue;
+      }
+      const uint64_t OrigOffset = Layout.getFragmentOffset(&F);
+      (void)OrigOffset; // Only read by the asserts below.
+      const unsigned OrigSize = computeFragmentSize(Layout, F);
+      if (OrigSize == 0 || Relaxable.empty()) {
+        Relaxable.clear();
+        continue;
+      }
+
+      // To keep the effects local, prefer to relax instructions closest to
+      // the align directive.  This is purely about human understandability
+      // of the resulting code.  If we later find a reason to expand
+      // particular instructions over others, we can adjust.
+      MCFragment *FirstChangedFragment = nullptr;
+      unsigned RemainingSize = OrigSize;
+      while (!Relaxable.empty() && RemainingSize != 0) {
+        auto &RF = *Relaxable.pop_back_val();
+        // Give the backend a chance to play any tricks it wishes to increase
+        // the encoding size of the given instruction.  Target independent code
+        // will try further relaxation, but targets may play further tricks.
+        if (getBackend().padInstructionEncoding(RF, getEmitter(),
+                                                RemainingSize))
+          FirstChangedFragment = &RF;
+
+        if (getBackend().mayNeedRelaxation(RF.getInst(),
+                                           *RF.getSubtargetInfo())) {
+          // An instruction which hasn't been fully relaxed can't be skipped:
+          // inserting bytes before it changes its starting offset, which might
+          // require a larger negative offset than it can encode.  Larger
+          // positive offsets aren't a concern, as no offsets between this and
+          // our align are visible and the ones afterwards aren't changing.
+          break;
+        }
+      }
+      Relaxable.clear();
+
+      // Unlike align, boundary align tracks its own size after relaxation.
+      if (F.getKind() == MCFragment::FT_BoundaryAlign)
+        cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize);
+
+      if (FirstChangedFragment) {
+        // Redo the layout for any fragments in the affected range.  This is
+        // mostly updating start offsets, but may also need to apply other
+        // updates (such as changing offsets) to those fragments.  The
+        // relaxation itself was already done above, so the total size is fixed.
+        Layout.invalidateFragmentsFrom(FirstChangedFragment);
+        while (FirstChangedFragment != &F) {
+          relaxFragment(Layout, *FirstChangedFragment);
+          FirstChangedFragment = FirstChangedFragment->getNextNode();
+        }
+      }
+
+      const uint64_t FinalOffset = Layout.getFragmentOffset(&F);
+      const uint64_t FinalSize = computeFragmentSize(Layout, F);
+      (void)FinalOffset; // Both are only read by the asserts below.
+      (void)FinalSize;
+      assert(OrigOffset + OrigSize == FinalOffset + FinalSize &&
+             "can't move start of next fragment!");
+      assert(FinalSize == RemainingSize && "inconsistent size computation?");
+    }
+  }
+}
+
+
 void MCAssembler::finishLayout(MCAsmLayout &Layout) {
   assert(getBackendPtr() && "Expected assembler backend");
   // The layout is done. Mark every fragment as valid.
Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
===================================================================
--- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -13,6 +13,7 @@
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -175,6 +176,10 @@
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override;
 
+  bool padInstructionEncoding(MCRelaxableFragment &RF,
+                              MCCodeEmitter &Emitter,
+                              unsigned &RemainingSize) override;
+
   bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
 };
 } // end anonymous namespace
@@ -632,6 +637,42 @@
   Res.setOpcode(RelaxedOp);
 }
 
+static bool canBeRelaxedForPadding(const MCRelaxableFragment &RF) {
+  // True if getRelaxedOpcode maps RF's instruction to a different opcode.
+  // TODO: Other tricks can increase encoding size without hurting performance.
+  auto &Inst = RF.getInst();
+  auto &STI = *RF.getSubtargetInfo();
+  bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+  return getRelaxedOpcode(Inst, is16BitMode) != Inst.getOpcode();
+}
+
+bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+                                           MCCodeEmitter &Emitter,
+                                           unsigned &RemainingSize) {
+  if (!canBeRelaxedForPadding(RF))
+    return false; // No different relaxed opcode exists for this instruction.
+
+  MCInst Relaxed;
+  relaxInstruction(RF.getInst(), *RF.getSubtargetInfo(), Relaxed);
+
+  SmallVector<MCFixup, 4> Fixups; // Fixups produced by the relaxed encoding.
+  SmallString<256> Code;          // Bytes of the relaxed encoding.
+  raw_svector_ostream VecOS(Code);
+  Emitter.encodeInstruction(Relaxed, VecOS, Fixups,
+                            *RF.getSubtargetInfo());
+  const unsigned OldSize = RF.getContents().size();
+  const unsigned NewSize = Code.size();
+  assert(NewSize >= OldSize && "size decrease during relaxation?");
+  unsigned Delta = NewSize - OldSize; // Padding bytes this change would absorb.
+  if (Delta > RemainingSize)
+    return false; // Would overshoot the requested padding; leave RF untouched.
+  RF.setInst(Relaxed); // Commit: replace the instruction, bytes, and fixups.
+  RF.getContents() = Code;
+  RF.getFixups() = Fixups;
+  RemainingSize -= Delta;
+  return true;
+}
+
 /// Write a sequence of optimal nops to the output, covering \p Count
 /// bytes.
 /// \return - true on success, false on failure
Index: llvm/test/MC/X86/align-branch-64.s
===================================================================
--- llvm/test/MC/X86/align-branch-64.s
+++ llvm/test/MC/X86/align-branch-64.s
@@ -103,6 +103,24 @@
 bar:
   retq
 
+  # CHECK: test_pad_via_relax:
+  # CHECK: 200: testq
+  # CHECK: 203: jne
+  # CHECK: 209: int3
+  # note 6 byte jne which could be a 2 byte jne, but is instead
+  # expanded for padding purposes
+  # CHECK-NOT: nop
+  # CHECK: 220: callq
+  .global test_pad_via_relax
+  .p2align  5
+test_pad_via_relax:
+  testq %rax, %rax
+  jnz bar
+  .rept 23
+  int3
+  .endr
+  callq bar
+
   .section "unknown"
   .p2align 4
   .type   baz,@function
Index: llvm/test/MC/X86/align-via-relaxation.s
===================================================================
--- /dev/null
+++ llvm/test/MC/X86/align-via-relaxation.s
@@ -0,0 +1,74 @@
+  # RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s
+
+
+  .file	"test.c"
+	.text
+	.section	.text
+  # Demonstrate that we can relax instructions to provide padding, not
+  # just insert nops.  jmps are being used for ease of demonstration.
+  # CHECK: .text
+  # CHECK: 0: eb 1f                        	jmp 31 <foo>
+  # CHECK: 2: e9 1a 00 00 00               	jmp 26 <foo>
+  # CHECK: 7: e9 15 00 00 00               	jmp 21 <foo>
+  # CHECK: c: e9 10 00 00 00               	jmp 16 <foo>
+  # CHECK: 11: e9 0b 00 00 00               jmp 11 <foo>
+  # CHECK: 16: e9 06 00 00 00               jmp 6 <foo>
+  # CHECK: 1b: e9 01 00 00 00               jmp 1 <foo>
+  # CHECK: 20: cc                           int3
+	.p2align 4
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+	.p2align 5
+  int3
+foo:
+  ret
+
+  # Check that we're not shifting around the offsets of labels - doing
+  # that would require a further round of relaxation
+  # CHECK: bar:
+  # CHECK: 22: eb fe                          jmp -2 <bar>
+  # CHECK: 24: 66 2e 0f 1f 84 00 00 00 00 00	nopw %cs:(%rax,%rax)
+  # CHECK: 2e: 66 90                        	nop
+  # CHECK: 30: 0f 0b                        	ud2
+
+bar:  
+  jmp bar
+nobypass:
+  .p2align 4
+  ud2
+
+
+  # Canonical toy loop to show benefit - we can align the loop header with
+  # fewer nops by relaxing the branch, even though we don't need to
+  # CHECK: loop_preheader:
+  # CHECK: 45: 48 85 c0                     	testq %rax, %rax
+  # CHECK: 48: 0f 8e 22 00 00 00            	jle 34 <loop_exit>
+  # CHECK: 4e: 66 2e 0f 1f 84 00 00 00 00 00	nopw %cs:(%rax,%rax)
+  # CHECK: 58: 0f 1f 84 00 00 00 00 00      	nopl (%rax,%rax)
+  # CHECK: loop_header:
+  # CHECK: 60: 48 83 e8 01                  	subq $1, %rax
+  # CHECK: 64: 48 85 c0                     	testq %rax, %rax
+  # CHECK: 67: 7e 07                        	jle 7 <loop_exit>
+  # CHECK: 69: e9 f2 ff ff ff               	jmp -14 <loop_header>
+  # CHECK: 6e: 66 90                        	nop
+  # CHECK: loop_exit:
+  # CHECK: 70: c3                           	retq
+  .p2align 5
+  .skip 5
+loop_preheader:
+  testq %rax, %rax
+  jle loop_exit
+  .p2align 5
+loop_header:
+  subq $1, %rax
+  testq %rax, %rax
+  jle loop_exit
+  jmp loop_header
+  .p2align 4
+loop_exit:
+  ret