Index: llvm/include/llvm/MC/MCAsmBackend.h =================================================================== --- llvm/include/llvm/MC/MCAsmBackend.h +++ llvm/include/llvm/MC/MCAsmBackend.h @@ -144,6 +144,15 @@ virtual bool mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const = 0; + /// Return true if the given instruction can be further relaxed. This is + /// different than mayNeedRelaxation in that it is allowed to return + /// true when relaxation is possible, but not required. + virtual bool canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const { + return mayNeedRelaxation(Inst, STI); + } + + /// Target specific predicate for whether a given fixup requires the /// associated instruction to be relaxed. virtual bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved, Index: llvm/include/llvm/MC/MCAssembler.h =================================================================== --- llvm/include/llvm/MC/MCAssembler.h +++ llvm/include/llvm/MC/MCAssembler.h @@ -203,6 +203,10 @@ MCCVInlineLineTableFragment &DF); bool relaxCVDefRange(MCAsmLayout &Layout, MCCVDefRangeFragment &DF); + /// Once relaxation is complete, try to reduce number of nops required + /// without requiring any further relaxation. + void optimizeLayout(MCAsmLayout &Layout); + /// finishLayout - Finalize a layout, including fragment lowering. void finishLayout(MCAsmLayout &Layout); Index: llvm/include/llvm/MC/MCFragment.h =================================================================== --- llvm/include/llvm/MC/MCFragment.h +++ llvm/include/llvm/MC/MCFragment.h @@ -274,6 +274,9 @@ } }; +/// Represents alignment for either code or data. For code, the Value and +/// ValueSize fields are ignored; nops are always emitted. For data, the +/// Value is repeated ValueSize times. Value is usually 0. class MCAlignFragment : public MCFragment { /// The alignment to ensure, in bytes. 
unsigned Alignment; Index: llvm/lib/MC/MCAssembler.cpp =================================================================== --- llvm/lib/MC/MCAssembler.cpp +++ llvm/lib/MC/MCAssembler.cpp @@ -791,6 +791,12 @@ errs() << "assembler backend - post-relaxation\n--\n"; dump(); }); + optimizeLayout(Layout); + + DEBUG_WITH_TYPE("mc-dump", { + errs() << "assembler backend - post-optimization\n--\n"; + dump(); }); + // Finalize the layout, including fragment lowering. finishLayout(Layout); @@ -1014,11 +1020,8 @@ uint64_t NewSize = needPadding(AlignedOffset, AlignedSize, BoundaryAlignment) ? offsetToAlignment(AlignedOffset, BoundaryAlignment) : 0U; - if (NewSize == OldSize) - return false; BF.setSize(NewSize); - Layout.invalidateFragmentsFrom(&BF); - return true; + return (NewSize != OldSize); } bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout, @@ -1164,6 +1167,125 @@ return WasRelaxed; } +void MCAssembler::optimizeLayout(MCAsmLayout &Layout) { + // See if we can further relax some instructions to cut down on the number of + // nop bytes required for code alignment. The actual win is in reducing + // instruction count, not number of bytes. Some micro-architectures (such + // as, say, modern X86-64) can easily end up decode limited. It is + // often better to reduce the number of instructions (i.e. eliminate nops) + // even at the cost of increasing the size and complexity of others. 
+ + DenseSet<const MCFragment *> LabeledFragments; + for (const MCSymbol &S : symbols()) + LabeledFragments.insert(S.getFragment(false)); + + for (iterator it = begin(), ie = end(); it != ie; ++it) { + MCSection &Sec = *it; + if (!Sec.getKind().isText()) + continue; + + SmallVector<MCRelaxableFragment *, 4> Relaxable; + for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { + MCFragment &F = *I; + + if (LabeledFragments.count(&F)) + Relaxable.clear(); + + if (F.getKind() == MCFragment::FT_Data) + // Skip and ignore + continue; + + if (F.getKind() == MCFragment::FT_Relaxable) { + auto &RF = cast<MCRelaxableFragment>(*I); + if (getBackend().canBeFurtherRelaxed(RF.getInst(), + *RF.getSubtargetInfo())) + Relaxable.push_back(&RF); + // Note: It's okay to skip an instruction which can't be further + // relaxed as it a) will be a fixed number of bytes, and b) must be + // able to encode any larger offsets which result from shifting it + // around. + continue; + } + + // For any unhandled kind, assume we can't change layout. + if (F.getKind() != MCFragment::FT_Align && + F.getKind() != MCFragment::FT_BoundaryAlign) { + Relaxable.clear(); + continue; + } + const unsigned OrigOffset = Layout.getFragmentOffset(&F); + const unsigned OrigSize = computeFragmentSize(Layout, F); + if (OrigSize == 0 || Relaxable.empty()) { + Relaxable.clear(); + continue; + } + + // To keep the effects local, prefer to relax instructions closest to + // the align directive. This is purely about human understandability + // of the resulting code. If we later find a reason to expand + // particular instructions over others, we can adjust. 
+ MCFragment *FirstChangedFragment = nullptr; + unsigned RemainingSize = OrigSize; + while (!Relaxable.empty() && RemainingSize != 0) { + auto *RF = Relaxable.pop_back_val(); + + MCInst Relaxed; + getBackend().relaxInstruction(RF->getInst(), *RF->getSubtargetInfo(), + Relaxed); + SmallVector<MCFixup, 4> Fixups; + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + getEmitter().encodeInstruction(Relaxed, VecOS, Fixups, + *RF->getSubtargetInfo()); + const unsigned OldSize = RF->getContents().size(); + const unsigned NewSize = Code.size(); + assert(NewSize >= OldSize && "size decrease during relaxation?"); + unsigned Delta = NewSize - OldSize; + if (Delta > RemainingSize) { + // Too big, can't use this relaxation without requiring another + // round of relaxation. Subtly, we can't skip past this one either + // as we *haven't* relaxed it. Changing its starting offset might + // require a larger negative offset than it can encode. We don't + // need to worry about larger positive offsets as none of the + // possible offsets between this and our align are visible, and the + // ones afterwards aren't changing. + break; + } + RemainingSize -= Delta; + RF->setInst(Relaxed); + RF->getContents() = Code; + RF->getFixups() = Fixups; + FirstChangedFragment = RF; + } + Relaxable.clear(); + + // Unlike align, boundary align tracks its own size after relaxation. + if (F.getKind() == MCFragment::FT_BoundaryAlign) + cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize); + + if (FirstChangedFragment) { + // Redo the layout for any fragments in the affected range. This is + // mostly updating start offsets, but also may need to apply other + // updates (such as changing offsets) to the fragments in question. + // Note that the relaxation itself has already been done above, and + // thus the total size of the range isn't changing. 
+ Layout.invalidateFragmentsFrom(FirstChangedFragment); + while (FirstChangedFragment != &F) { + relaxFragment(Layout, *FirstChangedFragment); + FirstChangedFragment = FirstChangedFragment->getNextNode(); + } + } + + const unsigned FinalOffset = Layout.getFragmentOffset(&F); + const unsigned FinalSize = computeFragmentSize(Layout, F); + assert(OrigOffset + OrigSize == FinalOffset + FinalSize && + "can't move start of next fragment!"); + assert(FinalSize == RemainingSize && "inconsistent size computation?"); + } + } +} + + void MCAssembler::finishLayout(MCAsmLayout &Layout) { assert(getBackendPtr() && "Expected assembler backend"); // The layout is done. Mark every fragment as valid. Index: llvm/lib/MC/MCFragment.cpp =================================================================== --- llvm/lib/MC/MCFragment.cpp +++ llvm/lib/MC/MCFragment.cpp @@ -394,6 +394,7 @@ OS << "\n "; OS << " Inst:"; F->getInst().dump_pretty(OS); + OS << " (" << F->getContents().size() << ")"; break; } case MCFragment::FT_Org: { Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -168,6 +168,9 @@ bool mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const override; + bool canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; @@ -603,6 +606,15 @@ return false; } +bool X86AsmBackend::canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const { + // TODO: There are lots of other tricks we could apply for increasing + // encoding size without impacting performance. 
+ bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + return getRelaxedOpcode(Inst, is16BitMode) != Inst.getOpcode(); +} + + bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, @@ -616,6 +628,8 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, MCInst &Res) const { + assert(canBeFurtherRelaxed(Inst, STI)); + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode); Index: llvm/test/MC/X86/align-branch-64.s =================================================================== --- llvm/test/MC/X86/align-branch-64.s +++ llvm/test/MC/X86/align-branch-64.s @@ -103,6 +103,24 @@ bar: retq + # CHECK: test_pad_via_relax: + # CHECK: 200: testq + # CHECK: 203: jne + # CHECK: 209: int3 + # note 6 byte jne which could be a 2 byte jne, but is instead + # expanded for padding purposes + # CHECK-NOT: nop + # CHECK: 220: callq + .global test_pad_via_relax + .p2align 5 +test_pad_via_relax: + testq %rax, %rax + jnz bar + .rept 23 + int3 + .endr + callq bar + .section "unknown" .p2align 4 .type baz,@function Index: llvm/test/MC/X86/align-via-relaxation.s =================================================================== --- /dev/null +++ llvm/test/MC/X86/align-via-relaxation.s @@ -0,0 +1,74 @@ + # RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s + + + .file "test.c" + .text + .section .text + # Demonstrate that we can relax instructions to provide padding, not + # just insert nops. jmps are being used for ease of demonstration. 
+ # CHECK: .text + # CHECK: 0: eb 1f jmp 31 + # CHECK: 2: e9 1a 00 00 00 jmp 26 + # CHECK: 7: e9 15 00 00 00 jmp 21 + # CHECK: c: e9 10 00 00 00 jmp 16 + # CHECK: 11: e9 0b 00 00 00 jmp 11 + # CHECK: 16: e9 06 00 00 00 jmp 6 + # CHECK: 1b: e9 01 00 00 00 jmp 1 + # CHECK: 20: cc int3 + .p2align 4 + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + .p2align 5 + int3 +foo: + ret + + # Check that we're not shifting around the offsets of labels - doing + # that would require a further round of relaxation + # CHECK: bar: + # CHECK: 22: eb fe jmp -2 + # CHECK: 24: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) + # CHECK: 2e: 66 90 nop + # CHECK: 30: 0f 0b ud2 + +bar: + jmp bar +nobypass: + .p2align 4 + ud2 + + + # Canonical toy loop to show benefit - we can align the loop header with + # fewer nops by relaxing the branch, even though we don't need to + # CHECK: loop_preheader: + # CHECK: 45: 48 85 c0 testq %rax, %rax + # CHECK: 48: 0f 8e 22 00 00 00 jle 34 + # CHECK: 4e: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) + # CHECK: 58: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax) + # CHECK: loop_header: + # CHECK: 60: 48 83 e8 01 subq $1, %rax + # CHECK: 64: 48 85 c0 testq %rax, %rax + # CHECK: 67: 7e 07 jle 7 + # CHECK: 69: e9 f2 ff ff ff jmp -14 + # CHECK: 6e: 66 90 nop + # CHECK: loop_exit: + # CHECK: 70: c3 retq + .p2align 5 + .skip 5 +loop_preheader: + testq %rax, %rax + jle loop_exit + .p2align 5 +loop_header: + subq $1, %rax + testq %rax, %rax + jle loop_exit + jmp loop_header + .p2align 4 +loop_exit: + ret