Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
===================================================================
--- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -815,6 +815,43 @@
   return !hasVariantSymbol(Inst);
 }
 
+/// Return the number of prefixes which can be added to an instruction without
+/// encountering decoder delays for the given target.  Note that the result is
+/// inclusive of escape bytes and mandatory prefixes.
+static unsigned maximumPrefixPadding(const MCSubtargetInfo &STI) {
+  // Summary of decoder behavior (per Agner's guide)
+  // AMD
+  //   K8 and K10 - three prefixes per instruction per cycle (i.e. no stall)
+  //   Bulldozer, Piledriver, Excavator, Steamroller - three prefixes per
+  //     instruction or 10-15 cycle stall
+  //   Ryzen - any number of prefixes in a single cycle
+  //   Bobcat, Jaguar - no penalty for multiple prefixes
+  // Intel
+  //   SandyBridge and later - can decode any number of prefixes in a single
+  //     cycle.
+  //   Atom - up to three prefixes can be decoded in a single cycle.  (Severe
+  //     delay for more)
+  //   Silvermont - can decode three prefixes *and escape sequences* in a
+  //     single cycle (severe stall if exceeded)
+  //   Goldmont - can decode (most combinations of) four prefixes in a single
+  //     cycle.
+  //   Knights Landing - can decode three prefixes and escape bytes in a single
+  //     cycle (5-6 cycle stall if exceeded)
+
+  // We use maximum nop size as a proxy for the number of prefixes we can
+  // safely execute.  Since nops longer than 8 are formed by adding redundant
+  // prefixes, this seems like a reasonable proxy.
+  if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
+    // TODO: This should probably be '3' once we validate that escape sequences
+    // are conservatively handled.
+    return 0;
+  else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+    return 15;
+  else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+    return 3;
+  return 2;
+}
+
 static unsigned getRemainingPrefixSize(const MCInst &Inst,
                                        const MCSubtargetInfo &STI,
                                        MCCodeEmitter &Emitter) {
@@ -823,13 +860,11 @@
   Emitter.emitPrefix(Inst, VecOS, STI);
   assert(Code.size() < 15 && "The number of prefixes must be less than 15.");
 
-  // TODO: It turns out we need a decent amount of plumbing for the target
-  // specific bits to determine number of prefixes its safe to add.  Various
-  // targets (older chips mostly, but also Atom family) encounter decoder
-  // stalls with too many prefixes.  For testing purposes, we set the value
-  // externally for the moment.
   unsigned ExistingPrefixSize = Code.size();
-  unsigned TargetPrefixMax = X86PadMaxPrefixSize;
+  unsigned TargetPrefixMax = maximumPrefixPadding(STI);
+  if (X86PadMaxPrefixSize.getNumOccurrences())
+    TargetPrefixMax = X86PadMaxPrefixSize;
+  
   if (TargetPrefixMax <= ExistingPrefixSize)
     return 0;
   return TargetPrefixMax - ExistingPrefixSize;
Index: llvm/test/MC/X86/align-branch-64-hardcode.s
===================================================================
--- llvm/test/MC/X86/align-branch-64-hardcode.s
+++ llvm/test/MC/X86/align-branch-64-hardcode.s
@@ -7,7 +7,7 @@
 
   # CHECK: 1d:       int3
   # CHECK: 1e:       jmp
-  # CHECK: 24:       int3
+  # CHECK: 26:       int3
   .p2align  5
   .rept 30
   int3
Index: llvm/test/MC/X86/align-branch-64-prefix.s
===================================================================
--- llvm/test/MC/X86/align-branch-64-prefix.s
+++ llvm/test/MC/X86/align-branch-64-prefix.s
@@ -6,8 +6,8 @@
   .text
 
   # CHECK: 1d:       int3
-  # CHECK: 1e:       jmp
-  # CHECK: 24:       int3
+  # CHECK-NEXT: 1e:       jmp
+  # CHECK-NEXT: 26:       int3
   .p2align  5
   .rept 30
   int3
@@ -17,8 +17,8 @@
   int3
 
   # CHECK: 5d:       int3
-  # CHECK: 5e:       jmp
-  # CHECK: 64:       int3
+  # CHECK-NEXT: 5e:       jmp
+  # CHECK-NEXT: 66:       int3
   .p2align  5
   .rept 30
   int3
@@ -28,8 +28,8 @@
   int3
 
   # CHECK: 9d:       int3
-  # CHECK: 9e:       call
-  # CHECK: a6:       int3
+  # CHECK-NEXT: 9e:       call
+  # CHECK-NEXT: a6:       int3
   .p2align  5
   .rept 30
   int3
@@ -39,8 +39,8 @@
   int3
 
   # CHECK: de:       lock
-  # CHECK: df:       jmp
-  # CHECK: e4:       int3
+  # CHECK-NEXT: df:       jmp
+  # CHECK-NEXT: e6:       int3
   .p2align  5
   .rept 30
   int3
@@ -49,9 +49,11 @@
   jmp baz
   int3
 
+  # Note: This one is possibly a decoder bug; gnu shows "rex.w, cs cs jmp 0x8"
   # CHECK: 11d:       int3
-  # CHECK: 11e:       jmp
-  # CHECK: 124:       int3
+  # CHECK-NEXT: 11e:       cs
+  # CHECK-NEXT: 120:       jmp
+  # CHECK-NEXT: 126:       int3
   .p2align  5
   .rept 30
   int3
@@ -61,8 +63,8 @@
   int3
 
   # CHECK: 15d:      int3
-  # CHECK: 15e:      {{.*}} jmp
-  # CHECK: 164:      int3
+  # CHECK-NEXT: 15e:      {{.*}} jmp
+  # CHECK-NEXT: 164:      int3
   .p2align  5
   .rept 30
   int3