Index: include/llvm/MC/MCAsmBackend.h
===================================================================
--- include/llvm/MC/MCAsmBackend.h
+++ include/llvm/MC/MCAsmBackend.h
@@ -116,11 +116,20 @@
   ///
   virtual unsigned getMinimumNopSize() const { return 1; }
 
-  /// Write an (optimal) nop sequence of Count bytes to the given output. If the
-  /// target cannot generate such a sequence, it should return an error.
+  /// Return the tokens to pass to the .arch directive in order to state the
+  /// preferences for expanding .p2align into nops. Empty list by default.
+  virtual SmallVector<std::string, 1>
+  getNopPrefs(const MCSubtargetInfo &STI) const {
+    return {};
+  }
+
+  /// Write an (optimal for \p STI subtarget) nop sequence of \p Count bytes to
+  /// the given output. If the target cannot generate such a sequence, it should
+  /// return an error.
   ///
   /// \return - True on success.
-  virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const = 0;
+  virtual bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                            MCObjectWriter *OW) const = 0;
 
   /// Give backend an opportunity to finish layout after relaxation
   virtual void finishLayout(MCAssembler const &Asm,
Index: include/llvm/MC/MCFragment.h
===================================================================
--- include/llvm/MC/MCFragment.h
+++ include/llvm/MC/MCFragment.h
@@ -220,11 +220,19 @@
 /// Fragment for data and encoded instructions.
 ///
 class MCDataFragment : public MCEncodedFragmentWithFixups<32, 4> {
+  /// STI - The MCSubtargetInfo for padding emission.
+  const MCSubtargetInfo *STI;
+
 public:
   MCDataFragment(MCSection *Sec = nullptr)
-      : MCEncodedFragmentWithFixups<32, 4>(FT_Data, false, Sec) {}
+      : MCEncodedFragmentWithFixups<32, 4>(FT_Data, false, Sec), STI(nullptr) {}
+
+  void setHasInstructions(bool V, const MCSubtargetInfo *S) {
+    HasInstructions = V;
+    STI = S;
+  }
 
-  void setHasInstructions(bool V) { HasInstructions = V; }
+  const MCSubtargetInfo *getSubtargetInfo() const { return STI; }
 
   static bool classof(const MCFragment *F) {
     return F->getKind() == MCFragment::FT_Data;
@@ -237,10 +245,16 @@
 /// consumption.
 ///
 class MCCompactEncodedInstFragment : public MCEncodedFragmentWithContents<4> {
+  /// STI - The MCSubtargetInfo for padding emission.
+  const MCSubtargetInfo &STI;
+
 public:
-  MCCompactEncodedInstFragment(MCSection *Sec = nullptr)
-      : MCEncodedFragmentWithContents(FT_CompactEncodedInst, true, Sec) {
-  }
+  MCCompactEncodedInstFragment(const MCSubtargetInfo &STI,
+                               MCSection *Sec = nullptr)
+      : MCEncodedFragmentWithContents(FT_CompactEncodedInst, true, Sec),
+        STI(STI) {}
+
+  const MCSubtargetInfo &getSubtargetInfo() const { return STI; }
 
   static bool classof(const MCFragment *F) {
     return F->getKind() == MCFragment::FT_CompactEncodedInst;
@@ -267,7 +281,7 @@
   const MCInst &getInst() const { return Inst; }
   void setInst(const MCInst &Value) { Inst = Value; }
 
-  const MCSubtargetInfo &getSubtargetInfo() { return STI; }
+  const MCSubtargetInfo &getSubtargetInfo() const { return STI; }
 
   static bool classof(const MCFragment *F) {
     return F->getKind() == MCFragment::FT_Relaxable;
@@ -294,12 +308,16 @@
   /// cannot be satisfied in this width then this fragment is ignored.
   unsigned MaxBytesToEmit;
 
+  /// STI - Subtarget info to specify which nop instructions to emit.
+  const MCSubtargetInfo &STI;
+
 public:
   MCAlignFragment(unsigned Alignment, int64_t Value, unsigned ValueSize,
-                  unsigned MaxBytesToEmit, MCSection *Sec = nullptr)
+                  unsigned MaxBytesToEmit, const MCSubtargetInfo &STI,
+                  MCSection *Sec = nullptr)
       : MCFragment(FT_Align, false, 0, Sec), Alignment(Alignment),
-        EmitNops(false), Value(Value),
-        ValueSize(ValueSize), MaxBytesToEmit(MaxBytesToEmit) {}
+        EmitNops(false), Value(Value), ValueSize(ValueSize),
+        MaxBytesToEmit(MaxBytesToEmit), STI(STI) {}
 
   /// \name Accessors
   /// @{
@@ -312,6 +330,8 @@
 
   unsigned getMaxBytesToEmit() const { return MaxBytesToEmit; }
 
+  const MCSubtargetInfo &getSubtargetInfo() const { return STI; }
+
   bool hasEmitNops() const { return EmitNops; }
   void setEmitNops(bool Value) { EmitNops = Value; }
 
Index: include/llvm/MC/MCObjectStreamer.h
===================================================================
--- include/llvm/MC/MCObjectStreamer.h
+++ include/llvm/MC/MCObjectStreamer.h
@@ -146,6 +146,7 @@
                 SMLoc Loc = SMLoc()) override;
   void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
                 SMLoc Loc = SMLoc()) override;
+  void EmitArch(StringRef Token) override;
 
   void FinishImpl() override;
 
Index: include/llvm/MC/MCStreamer.h
===================================================================
--- include/llvm/MC/MCStreamer.h
+++ include/llvm/MC/MCStreamer.h
@@ -190,6 +190,9 @@
   unsigned NextWinCFIID = 0;
 
 protected:
+  /// Subtarget currently in effect; null until setSubtarget() is called.
+  const MCSubtargetInfo *STI = nullptr;
+
   MCStreamer(MCContext &Ctx);
 
   virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
@@ -213,6 +216,13 @@
     TargetStreamer.reset(TS);
   }
 
+  const MCSubtargetInfo &getSubtarget() const {
+    assert(STI && "STI object should be always available.");
+    return *STI;
+  }
+
+  void setSubtarget(const MCSubtargetInfo &S) { STI = &S; }
+
   /// State management
   ///
   virtual void reset();
@@ -758,6 +768,9 @@
   virtual void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except);
   virtual void EmitWinEHHandlerData();
 
+  virtual void EmitArch(StringRef Token);
+  void EmitArchForAlignment(const MCAsmBackend &MAB);
+
   /// Get the .pdata section used for the given section. Typically the given
   /// section is either the main .text section or some other COMDAT .text
   /// section, but it may be any section containing code.
Index: include/llvm/MC/MCSubtargetInfo.h
===================================================================
--- include/llvm/MC/MCSubtargetInfo.h
+++ include/llvm/MC/MCSubtargetInfo.h
@@ -63,9 +63,10 @@
   const Triple &getTargetTriple() const { return TargetTriple; }
 
   /// getCPU - Return the CPU string.
-  StringRef getCPU() const {
-    return CPU;
-  }
+  StringRef getCPU() const { return CPU; }
+
+  /// setCPU - Set the CPU string.
+  void setCPU(const StringRef NewCPU) { CPU = NewCPU.str(); }
 
   /// getFeatureBits - Return the feature bits.
   ///
@@ -164,6 +165,12 @@
     auto Found = std::lower_bound(ProcDesc.begin(), ProcDesc.end(), CPU);
     return Found != ProcDesc.end() && StringRef(Found->Key) == CPU;
   }
+
+  /// Check whether \p Feature is the key of an entry in ProcFeatures.
+  bool isFeatureStringValid(StringRef Feature) const {
+    auto Found = std::lower_bound(ProcFeatures.begin(), ProcFeatures.end(), Feature);
+    return Found != ProcFeatures.end() && StringRef(Found->Key) == Feature;
+  }
 };
 
 } // End llvm namespace
Index: include/llvm/Support/TargetRegistry.h
===================================================================
--- include/llvm/Support/TargetRegistry.h
+++ include/llvm/Support/TargetRegistry.h
@@ -22,6 +22,7 @@
 #include "llvm-c/Disassembler.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/FormattedStream.h"
 #include <cassert>
@@ -40,7 +41,6 @@
 class MCInstPrinter;
 class MCInstrInfo;
 class MCRegisterInfo;
-class MCStreamer;
 class MCSubtargetInfo;
 class MCSymbolizer;
 class MCRelocationInfo;
@@ -461,6 +461,7 @@
     }
     if (ObjectTargetStreamerCtorFn)
       ObjectTargetStreamerCtorFn(*S, STI);
+    S->setSubtarget(STI);
     return S;
   }
 
@@ -468,12 +469,14 @@
                                 std::unique_ptr<formatted_raw_ostream> OS,
                                 bool IsVerboseAsm, bool UseDwarfDirectory,
                                 MCInstPrinter *InstPrint, MCCodeEmitter *CE,
-                                MCAsmBackend *TAB, bool ShowInst) const {
+                                MCAsmBackend *TAB, bool ShowInst,
+                                const MCSubtargetInfo &STI) const {
     formatted_raw_ostream &OSRef = *OS;
     MCStreamer *S = llvm::createAsmStreamer(Ctx, std::move(OS), IsVerboseAsm,
                                             UseDwarfDirectory, InstPrint, CE,
                                             TAB, ShowInst);
     createAsmTargetStreamer(*S, OSRef, InstPrint, IsVerboseAsm);
+    S->setSubtarget(STI);
     return S;
   }
 
Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp
===================================================================
--- lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1263,6 +1263,11 @@
 
 void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
   this->MF = &MF;
+
+  // Pass STI to the streamer to be able to generate long nops according to
+  // the function-specific subtarget information.
+  OutStreamer->setSubtarget(MF.getSubtarget());
+
   // Get the function symbol.
   CurrentFnSym = getSymbol(MF.getFunction());
   CurrentFnSymForSize = CurrentFnSym;
Index: lib/CodeGen/LLVMTargetMachine.cpp
===================================================================
--- lib/CodeGen/LLVMTargetMachine.cpp
+++ lib/CodeGen/LLVMTargetMachine.cpp
@@ -235,7 +235,7 @@
     MCStreamer *S = getTarget().createAsmStreamer(
         *Context, std::move(FOut), Options.MCOptions.AsmVerbose,
         Options.MCOptions.MCUseDwarfDirectory, InstPrinter, MCE, MAB,
-        Options.MCOptions.ShowMCInst);
+        Options.MCOptions.ShowMCInst, STI);
     AsmStreamer.reset(S);
     break;
   }
Index: lib/MC/MCAsmStreamer.cpp
===================================================================
--- lib/MC/MCAsmStreamer.cpp
+++ lib/MC/MCAsmStreamer.cpp
@@ -280,6 +280,8 @@
   /// indicated by the hasRawTextSupport() predicate.
   void EmitRawTextImpl(StringRef String) override;
 
+  void EmitArch(StringRef Token) override;
+
   void FinishImpl() override;
 };
 
@@ -915,6 +917,9 @@
 void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
                                          unsigned ValueSize,
                                          unsigned MaxBytesToEmit) {
+  if (AsmBackend)
+    EmitArchForAlignment(*AsmBackend);
+
   // Some assemblers don't support non-power of two alignments, so we always
   // emit alignments as a power of two if possible.
   if (isPowerOf2_32(ByteAlignment)) {
@@ -1594,6 +1599,11 @@
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitArch(StringRef Token) {
+  OS << "\t.arch\t" << Token;
+  EmitEOL();
+}
+
 void MCAsmStreamer::FinishImpl() {
   // If we are generating dwarf for assembly source files dump out the sections.
   if (getContext().getGenDwarfForAssembly())
Index: lib/MC/MCAssembler.cpp
===================================================================
--- lib/MC/MCAssembler.cpp
+++ lib/MC/MCAssembler.cpp
@@ -390,11 +390,30 @@
   // Should NOP padding be written out before this fragment?
   unsigned BundlePadding = F.getBundlePadding();
   if (BundlePadding > 0) {
+    const MCSubtargetInfo *STI;
+
     assert(isBundlingEnabled() &&
            "Writing bundle padding with disabled bundling");
     assert(F.hasInstructions() &&
            "Writing bundle padding for a fragment without instructions");
 
+    // Get subtarget information for emitting padding.
+    switch (F.getKind()) {
+    case MCFragment::FT_CompactEncodedInst:
+      STI = &cast<MCCompactEncodedInstFragment>(F).getSubtargetInfo();
+      break;
+    case MCFragment::FT_Data:
+      STI = cast<MCDataFragment>(F).getSubtargetInfo();
+      break;
+    case MCFragment::FT_Relaxable:
+      STI = &cast<MCRelaxableFragment>(F).getSubtargetInfo();
+      break;
+    default:
+      llvm_unreachable("Unexpected fragment kind.");
+    }
+
+    assert(STI && "Subtarget information must be available.");
+
     unsigned TotalLength = BundlePadding + static_cast<unsigned>(FSize);
     if (F.alignToBundleEnd() && TotalLength > getBundleAlignSize()) {
       // If the padding itself crosses a bundle boundary, it must be emitted
@@ -406,12 +425,12 @@
       // ----------------------------
       //        ^-------------------^   <- TotalLength
       unsigned DistanceToBoundary = TotalLength - getBundleAlignSize();
-      if (!getBackend().writeNopData(DistanceToBoundary, OW))
+      if (!getBackend().writeNopData(DistanceToBoundary, *STI, OW))
           report_fatal_error("unable to write NOP sequence of " +
                              Twine(DistanceToBoundary) + " bytes");
       BundlePadding -= DistanceToBoundary;
     }
-    if (!getBackend().writeNopData(BundlePadding, OW))
+    if (!getBackend().writeNopData(BundlePadding, *STI, OW))
       report_fatal_error("unable to write NOP sequence of " +
                          Twine(BundlePadding) + " bytes");
   }
@@ -456,7 +475,7 @@
     // bytes left to fill use the Value and ValueSize to fill the rest.
     // If we are aligning with nops, ask that target to emit the right data.
     if (AF.hasEmitNops()) {
-      if (!Asm.getBackend().writeNopData(Count, OW))
+      if (!Asm.getBackend().writeNopData(Count, AF.getSubtargetInfo(), OW))
         report_fatal_error("unable to write nop sequence of " +
                           Twine(Count) + " bytes");
       break;
Index: lib/MC/MCELFStreamer.cpp
===================================================================
--- lib/MC/MCELFStreamer.cpp
+++ lib/MC/MCELFStreamer.cpp
@@ -82,7 +82,7 @@
                                  DF->getContents().size());
     DF->getFixups().push_back(EF->getFixups()[i]);
   }
-  DF->setHasInstructions(true);
+  DF->setHasInstructions(true, &getSubtarget());
   DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
 }
 
@@ -516,7 +516,8 @@
       // Optimize memory usage by emitting the instruction to a
       // MCCompactEncodedInstFragment when not in a bundle-locked group and
       // there are no fixups registered.
-      MCCompactEncodedInstFragment *CEIF = new MCCompactEncodedInstFragment();
+      MCCompactEncodedInstFragment *CEIF =
+          new MCCompactEncodedInstFragment(getSubtarget());
       insert(CEIF);
       CEIF->getContents().append(Code.begin(), Code.end());
       return;
@@ -544,7 +545,7 @@
     Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
     DF->getFixups().push_back(Fixups[i]);
   }
-  DF->setHasInstructions(true);
+  DF->setHasInstructions(true, &getSubtarget());
   DF->getContents().append(Code.begin(), Code.end());
 
   if (Assembler.isBundlingEnabled() && Assembler.getRelaxAll()) {
Index: lib/MC/MCMachOStreamer.cpp
===================================================================
--- lib/MC/MCMachOStreamer.cpp
+++ lib/MC/MCMachOStreamer.cpp
@@ -427,7 +427,8 @@
 
   // Emit an align fragment if necessary.
   if (ByteAlignment != 1)
-    new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, Section);
+    new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, getSubtarget(),
+                        Section);
 
   MCFragment *F = new MCFillFragment(0, Size, Section);
   Symbol->setFragment(F);
Index: lib/MC/MCObjectStreamer.cpp
===================================================================
--- lib/MC/MCObjectStreamer.cpp
+++ lib/MC/MCObjectStreamer.cpp
@@ -424,9 +424,12 @@
                                             int64_t Value,
                                             unsigned ValueSize,
                                             unsigned MaxBytesToEmit) {
+  EmitArchForAlignment(Assembler->getBackend());
+
   if (MaxBytesToEmit == 0)
     MaxBytesToEmit = ByteAlignment;
-  insert(new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit));
+  insert(new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit,
+                             getSubtarget()));
 
   // Update the maximum alignment on the current section if necessary.
   MCSection *CurSec = getCurrentSection().first;
@@ -534,6 +537,35 @@
   MCStreamer::emitFill(IntNumValues, Size, Expr);
 }
 
+/// Apply a single .arch token to a copy of the current subtarget and make
+/// that copy current. A token starting with '.' toggles one feature; any
+/// other token selects a CPU. Tokens are validated by assertions only.
+void MCObjectStreamer::EmitArch(StringRef Token) {
+  assert(Token.find_first_of(" \t") == StringRef::npos &&
+         ".arch directive has too many arguments.");
+
+  // Mutate the object returned by getSubtargetCopy(), not the current one.
+  MCSubtargetInfo &Updated = getContext().getSubtargetCopy(getSubtarget());
+
+  if (!Token.startswith(".")) {
+    // CPU form: take the default features of the named CPU.
+    assert(Updated.isCPUStringValid(Token) && "Unknown CPU name.");
+    Updated.setDefaultFeatures(Token, "");
+    Updated.setCPU(Token);
+  } else {
+    // Feature form: ".name" enables a feature, ".noname" disables it.
+    StringRef Name = Token.substr(1);
+    const bool Disable = Name.startswith_lower("no");
+    if (Disable)
+      Name = Name.substr(2);
+
+    assert(Updated.isFeatureStringValid(Name) && "Unknown CPU feature.");
+    Updated.ApplyFeatureFlag((Disable ? "-" : "+") + Name.str());
+  }
+
+  setSubtarget(Updated);
+}
+
 void MCObjectStreamer::FinishImpl() {
   // If we are generating dwarf for assembly source files dump out the sections.
   if (getContext().getGenDwarfForAssembly())
Index: lib/MC/MCStreamer.cpp
===================================================================
--- lib/MC/MCStreamer.cpp
+++ lib/MC/MCStreamer.cpp
@@ -23,12 +23,21 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCWin64EH.h"
 #include "llvm/Support/COFF.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdlib>
 using namespace llvm;
 
+// Emitting .arch directive hurts compatibility with assemblers other than LLVM,
+// so don't preserve alignment instructions across assembly by default.
+static cl::opt<bool>
+    AsmPreserveAlignment("asm-preserve-alignment", cl::Hidden,
+                         cl::desc("Preserve alignment instructions across "
+                                  "assembly by emitting .arch directive"),
+                         cl::init(false));
+
 // Pin the vtables to this file.
 MCTargetStreamer::~MCTargetStreamer() {}
 
@@ -518,6 +527,18 @@
     report_fatal_error("Chained unwind areas can't have handlers!");
 }
 
+void MCStreamer::EmitArch(StringRef Token) {}
+
+// Emit one .arch directive per nop preference of \p MAB (opt-in via flag).
+void MCStreamer::EmitArchForAlignment(const MCAsmBackend &MAB) {
+  if (!AsmPreserveAlignment)
+    return;
+  assert(STI && "Subtarget info must be available.");
+  // Bind by const reference: every preference is a std::string.
+  for (const std::string &T : MAB.getNopPrefs(*STI))
+    EmitArch(T);
+}
+
 static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID,
                                    MCSection *MainCFISec,
                                    const MCSection *TextSec) {
Index: lib/MC/WinCOFFStreamer.cpp
===================================================================
--- lib/MC/WinCOFFStreamer.cpp
+++ lib/MC/WinCOFFStreamer.cpp
@@ -249,7 +249,7 @@
 
   if (ByteAlignment != 1)
     new MCAlignFragment(ByteAlignment, /*Value=*/0, /*ValueSize=*/0,
-                        ByteAlignment, Section);
+                        ByteAlignment, getSubtarget(), Section);
 
   MCFillFragment *Fragment = new MCFillFragment(
       /*Value=*/0, Size, Section);
Index: lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
===================================================================
--- lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -80,7 +80,8 @@
                             const MCAsmLayout &Layout) const override;
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override;
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
 
   void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
 
@@ -319,7 +320,8 @@
   llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
 }
 
-bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AArch64AsmBackend::writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                                     MCObjectWriter *OW) const {
   // If the count is not 4-byte aligned, we must be writing data into the text
   // section (otherwise we have unaligned instructions, and thus have far
   // bigger problems), so just write zeros instead.
Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
===================================================================
--- lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -58,7 +58,8 @@
     assert(!"Not implemented");
   }
   bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
 
   const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
 };
@@ -141,7 +142,8 @@
   return Infos[Kind - FirstTargetFixupKind];
 }
 
-bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool AMDGPUAsmBackend::writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                                    MCObjectWriter *OW) const {
   OW->WriteZeros(Count);
 
   return true;
Index: lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
===================================================================
--- lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -66,7 +66,8 @@
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override;
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
 
   void handleAssemblerFlag(MCAssemblerFlag Flag) override;
 
Index: lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
===================================================================
--- lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -289,7 +289,8 @@
   Res.setOpcode(RelaxedOp);
 }
 
-bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool ARMAsmBackend::writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                                 MCObjectWriter *OW) const {
   const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
   const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
   const uint32_t ARMv4_NopEncoding = 0xe1a00000;   // using MOV r0,r0
Index: lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
===================================================================
--- lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -50,10 +50,12 @@
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override {}
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
 };
 
-bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool BPFAsmBackend::writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                                 MCObjectWriter *OW) const {
   if ((Count % 8) != 0)
     return false;
 
Index: lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
===================================================================
--- lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -665,8 +665,8 @@
     assert(Update && "Didn't find relaxation target");
   }
 
-  bool writeNopData(uint64_t Count,
-                    MCObjectWriter * OW) const override {
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override {
     static const uint32_t Nopcode  = 0x7f000000, // Hard-coded NOP.
                           ParseIn  = 0x00004000, // In packet parse-bits.
                           ParseEnd = 0x0000c000; // End of packet parse-bits.
Index: lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
===================================================================
--- lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -75,10 +75,12 @@
                         const MCSubtargetInfo & /*STI*/,
                         MCInst & /*Res*/) const override {}
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
 };
 
-bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool LanaiAsmBackend::writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                                   MCObjectWriter *OW) const {
   if ((Count % 4) != 0)
     return false;
 
Index: lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
===================================================================
--- lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -80,7 +80,8 @@
 
   /// @}
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
 
   void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
                          const MCFixup &Fixup, const MCFragment *DF,
Index: lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
===================================================================
--- lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -455,7 +455,8 @@
 /// it should return an error.
 ///
 /// \return - True on success.
-bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool MipsAsmBackend::writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                                  MCObjectWriter *OW) const {
   // Check for a less than instruction size number of bytes
   // FIXME: 16 bit instructions are not handled yet here.
   // We shouldn't be using a hard coded number for instruction size.
Index: lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
===================================================================
--- lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -174,7 +174,8 @@
     llvm_unreachable("relaxInstruction() unimplemented");
   }
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override {
     uint64_t NumNops = Count / 4;
     for (uint64_t i = 0; i != NumNops; ++i)
       OW->write32(0x60000000);
Index: lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
===================================================================
--- lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -254,7 +254,8 @@
       llvm_unreachable("relaxInstruction() unimplemented");
     }
 
-    bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+    bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                      MCObjectWriter *OW) const override {
       // Cannot emit NOP with size not multiple of 32 bits.
       if (Count % 4 != 0)
         return false;
Index: lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
===================================================================
--- lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -62,7 +62,8 @@
                         MCInst &Res) const override {
     llvm_unreachable("SystemZ does do not have assembler relaxation");
   }
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
     return createSystemZObjectWriter(OS, OSABI);
   }
@@ -104,6 +105,7 @@
 }
 
 bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
+                                       const MCSubtargetInfo &STI,
                                        MCObjectWriter *OW) const {
   for (uint64_t I = 0; I != Count; ++I)
     OW->write8(7);
Index: lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
===================================================================
--- lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -58,10 +58,12 @@
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override {}
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
 };
 
 bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
+                                         const MCSubtargetInfo &STI,
                                          MCObjectWriter *OW) const {
   if (Count == 0)
     return true;
Index: lib/Target/X86/AsmParser/X86AsmParser.cpp
===================================================================
--- lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -718,6 +718,7 @@
   bool parseDirectiveEven(SMLoc L);
   bool ParseDirectiveWord(unsigned Size, SMLoc L);
   bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+  bool ParseDirectiveArch(SMLoc L);
 
   bool processInstruction(MCInst &Inst, const OperandVector &Ops);
 
@@ -2927,6 +2928,8 @@
     return false;
   } else if (IDVal == ".even")
     return parseDirectiveEven(DirectiveID.getLoc());
+  else if (IDVal == ".arch")
+    return ParseDirectiveArch(DirectiveID.getLoc());
   return true;
 }
 
@@ -2948,6 +2951,7 @@
     getStreamer().EmitValueToAlignment(2, 0, 1, 0);
   return false;
 }
+
 /// ParseDirectiveWord
 ///  ::= .word [ expression (, expression)* ]
 bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
@@ -3015,6 +3019,60 @@
   return false;
 }
 
+/// ParseDirectiveArch
+///  ::= .arch cpu
+///  ::= .arch .[no]feature
+/// Diagnostics are reported through Error(); the result is always false.
+bool X86AsmParser::ParseDirectiveArch(SMLoc L) {
+  // Take the rest of the statement as one string instead of walking tokens,
+  // so that feature names containing '-' need no special handling.
+  StringRef Token = getParser().parseStringToEndOfStatement().trim();
+
+  // Exactly one argument is accepted: reject embedded blanks or tabs.
+  if (Token.find_first_of(" \t") != StringRef::npos) {
+    Error(L, "unexpected number of arguments in .arch directive");
+    return false;
+  }
+
+  MCSubtargetInfo &Updated = copySTI();
+
+  if (!Token.startswith(".")) {
+    // CPU form. Pick the mode feature first so that the current mode is
+    // handed to setDefaultFeatures() together with the new CPU.
+    StringRef Mode = "+16bit-mode";
+    if (is64BitMode())
+      Mode = "+64bit-mode";
+    else if (is32BitMode())
+      Mode = "+32bit-mode";
+
+    if (!Updated.isCPUStringValid(Token)) {
+      Error(L, "unrecognized CPU \'" + Token + "\' in .arch directive");
+      return false;
+    }
+
+    Updated.setDefaultFeatures(Token, Mode);
+    Updated.setCPU(Token);
+  } else {
+    // Feature form: ".name" enables a feature, ".noname" disables it.
+    StringRef Feature = Token.substr(1);
+    const bool Disable = Feature.startswith_lower("no");
+    if (Disable)
+      Feature = Feature.substr(2);
+
+    if (!Updated.isFeatureStringValid(Feature)) {
+      Error(L, "unrecognized feature \'" + Feature + "\' in .arch directive");
+      return false;
+    }
+
+    Updated.ApplyFeatureFlag((Disable ? "-" : "+") + Feature.str());
+  }
+
+  // NOTE(review): the parser's available-feature mask is not recomputed
+  // here - confirm the matcher does not need to see the new feature bits.
+  getParser().getStreamer().EmitArch(Token);
+  return false;
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeX86AsmParser() {
   RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target);
Index: lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
===================================================================
--- lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -29,6 +29,12 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+// Long nop related features: Bit indexes the subtarget feature bits and Name
+// is the .arch spelling. None exist yet, so use an empty ArrayRef rather than
+// a zero-length array, which is a compiler extension and not standard C++.
+struct LongNopDesc { unsigned Bit; StringRef Name; };
+static const ArrayRef<LongNopDesc> LongNopFeatures = {};
+
 static unsigned getFixupKindLog2Size(unsigned Kind) {
   switch (Kind) {
   default:
@@ -70,19 +76,8 @@
 };
 
 class X86AsmBackend : public MCAsmBackend {
-  const StringRef CPU;
-  bool HasNopl;
-  const uint64_t MaxNopLength;
 public:
-  X86AsmBackend(const Target &T, StringRef CPU)
-      : MCAsmBackend(), CPU(CPU),
-        MaxNopLength((CPU == "slm" || CPU == "lakemont") ? 7 : 15) {
-    HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
-              CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
-              CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
-              CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
-              CPU != "c3" && CPU != "c3-2";
-  }
+  X86AsmBackend(const Target &T) : MCAsmBackend() {}
 
   unsigned getNumFixupKinds() const override {
     return X86::NumTargetFixupKinds;
@@ -135,7 +130,31 @@
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override;
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  bool writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                    MCObjectWriter *OW) const override;
+
+  /// Returns the .arch tokens for \p STI: the CPU name, then ".[no]feature"
+  /// for each long nop feature that differs from that CPU's defaults.
+  SmallVector<std::string, 1>
+  getNopPrefs(const MCSubtargetInfo &STI) const override {
+    SmallVector<std::string, 1> Prefs;
+    if (!STI.isCPUStringValid(STI.getCPU()))
+      return Prefs; // Emit ".arch" directive only for valid CPUs.
+
+    // Emit ".arch cpu".
+    Prefs.push_back(STI.getCPU().str());
+
+    // Emit ".arch .[no]feature" if some of the long nop related features were
+    // explicitly changed from the default values.
+    MCSubtargetInfo DefaultSTI(STI);
+    DefaultSTI.setDefaultFeatures(STI.getCPU(), "");
+    for (const auto &Desc : LongNopFeatures) {
+      const bool Enabled = STI.getFeatureBits()[Desc.Bit];
+      if (DefaultSTI.getFeatureBits()[Desc.Bit] != Enabled)
+        Prefs.push_back((Enabled ? "." : ".no") + Desc.Name.str());
+    }
+    return Prefs;
+  }
 };
 } // end anonymous namespace
 
@@ -324,28 +343,37 @@
 /// \brief Write a sequence of optimal nops to the output, covering \p Count
 /// bytes.
 /// \return - true on success, false on failure
-bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+bool X86AsmBackend::writeNopData(uint64_t Count, const MCSubtargetInfo &STI,
+                                 MCObjectWriter *OW) const {
+  const StringRef CPU = STI.getCPU();
+  const bool HasNopl =
+      CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" &&
+      CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" &&
+      CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && CPU != "geode" &&
+      CPU != "winchip-c6" && CPU != "winchip2" && CPU != "c3" && CPU != "c3-2";
+  const uint64_t MaxNopLength = (CPU == "slm" || CPU == "lakemont") ? 7 : 15;
+
   static const uint8_t Nops[10][10] = {
-    // nop
-    {0x90},
-    // xchg %ax,%ax
-    {0x66, 0x90},
-    // nopl (%[re]ax)
-    {0x0f, 0x1f, 0x00},
-    // nopl 0(%[re]ax)
-    {0x0f, 0x1f, 0x40, 0x00},
-    // nopl 0(%[re]ax,%[re]ax,1)
-    {0x0f, 0x1f, 0x44, 0x00, 0x00},
-    // nopw 0(%[re]ax,%[re]ax,1)
-    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
-    // nopl 0L(%[re]ax)
-    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
-    // nopl 0L(%[re]ax,%[re]ax,1)
-    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
-    // nopw 0L(%[re]ax,%[re]ax,1)
-    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
-    // nopw %cs:0L(%[re]ax,%[re]ax,1)
-    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+      // nop
+      {0x90},
+      // xchg %ax,%ax
+      {0x66, 0x90},
+      // nopl (%[re]ax)
+      {0x0f, 0x1f, 0x00},
+      // nopl 0(%[re]ax)
+      {0x0f, 0x1f, 0x40, 0x00},
+      // nopl 0(%[re]ax,%[re]ax,1)
+      {0x0f, 0x1f, 0x44, 0x00, 0x00},
+      // nopw 0(%[re]ax,%[re]ax,1)
+      {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+      // nopl 0L(%[re]ax)
+      {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+      // nopl 0L(%[re]ax,%[re]ax,1)
+      {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+      // nopw 0L(%[re]ax,%[re]ax,1)
+      {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+      // nopw %cs:0L(%[re]ax,%[re]ax,1)
+      {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
   };
 
   // This CPU doesn't support long nops. If needed add more.
@@ -380,14 +408,14 @@
 class ELFX86AsmBackend : public X86AsmBackend {
 public:
   uint8_t OSABI;
-  ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
-      : X86AsmBackend(T, CPU), OSABI(OSABI) {}
+  ELFX86AsmBackend(const Target &T, uint8_t OSABI)
+      : X86AsmBackend(T), OSABI(OSABI) {}
 };
 
 class ELFX86_32AsmBackend : public ELFX86AsmBackend {
 public:
-  ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
-    : ELFX86AsmBackend(T, OSABI, CPU) {}
+  ELFX86_32AsmBackend(const Target &T, uint8_t OSABI)
+      : ELFX86AsmBackend(T, OSABI) {}
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
     return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
@@ -396,8 +424,8 @@
 
 class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
 public:
-  ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
-      : ELFX86AsmBackend(T, OSABI, CPU) {}
+  ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI)
+      : ELFX86AsmBackend(T, OSABI) {}
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
     return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
@@ -407,8 +435,8 @@
 
 class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend {
 public:
-  ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
-      : ELFX86AsmBackend(T, OSABI, CPU) {}
+  ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI)
+      : ELFX86AsmBackend(T, OSABI) {}
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
     return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
@@ -418,11 +446,12 @@
 
 class ELFX86_64AsmBackend : public ELFX86AsmBackend {
 public:
-  ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
-    : ELFX86AsmBackend(T, OSABI, CPU) {}
+  ELFX86_64AsmBackend(const Target &T, uint8_t OSABI)
+      : ELFX86AsmBackend(T, OSABI) {}
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
-    return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI,
+                                    ELF::EM_X86_64);
   }
 };
 
@@ -430,10 +459,8 @@
   bool Is64Bit;
 
 public:
-  WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
-    : X86AsmBackend(T, CPU)
-    , Is64Bit(is64Bit) {
-  }
+  WindowsX86AsmBackend(const Target &T, bool is64Bit)
+      : X86AsmBackend(T), Is64Bit(is64Bit) {}
 
   Optional<MCFixupKind> getFixupKind(StringRef Name) const override {
     return StringSwitch<Optional<MCFixupKind>>(Name)
@@ -784,9 +811,8 @@
   }
 
 public:
-  DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
-                      bool Is64Bit)
-    : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+  DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, bool Is64Bit)
+      : X86AsmBackend(T), MRI(MRI), Is64Bit(Is64Bit) {
     memset(SavedRegs, 0, sizeof(SavedRegs));
     OffsetSize = Is64Bit ? 8 : 4;
     MoveInstrSize = Is64Bit ? 3 : 2;
@@ -796,9 +822,8 @@
 
 class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
 public:
-  DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
-                         StringRef CPU)
-      : DarwinX86AsmBackend(T, MRI, CPU, false) {}
+  DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI)
+      : DarwinX86AsmBackend(T, MRI, false) {}
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
     return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
@@ -808,17 +833,18 @@
 
   /// \brief Generate the compact unwind encoding for the CFI instructions.
   uint32_t generateCompactUnwindEncoding(
-                             ArrayRef<MCCFIInstruction> Instrs) const override {
+      ArrayRef<MCCFIInstruction> Instrs) const override {
     return generateCompactUnwindEncodingImpl(Instrs);
   }
 };
 
 class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
   const MachO::CPUSubTypeX86 Subtype;
+
 public:
   DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
-                         StringRef CPU, MachO::CPUSubTypeX86 st)
-      : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
+                         MachO::CPUSubTypeX86 st)
+      : DarwinX86AsmBackend(T, MRI, true), Subtype(st) {}
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
     return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
@@ -827,7 +853,7 @@
 
   /// \brief Generate the compact unwind encoding for the CFI instructions.
   uint32_t generateCompactUnwindEncoding(
-                             ArrayRef<MCCFIInstruction> Instrs) const override {
+      ArrayRef<MCCFIInstruction> Instrs) const override {
     return generateCompactUnwindEncodingImpl(Instrs);
   }
 };
@@ -840,17 +866,17 @@
                                            StringRef CPU,
                                            const MCTargetOptions &Options) {
   if (TheTriple.isOSBinFormatMachO())
-    return new DarwinX86_32AsmBackend(T, MRI, CPU);
+    return new DarwinX86_32AsmBackend(T, MRI);
 
   if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
-    return new WindowsX86AsmBackend(T, false, CPU);
+    return new WindowsX86AsmBackend(T, false);
 
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
 
   if (TheTriple.isOSIAMCU())
-    return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU);
+    return new ELFX86_IAMCUAsmBackend(T, OSABI);
 
-  return new ELFX86_32AsmBackend(T, OSABI, CPU);
+  return new ELFX86_32AsmBackend(T, OSABI);
 }
 
 MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
@@ -863,15 +889,15 @@
         StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
             .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
             .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
-    return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
+    return new DarwinX86_64AsmBackend(T, MRI, CS);
   }
 
   if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
-    return new WindowsX86AsmBackend(T, true, CPU);
+    return new WindowsX86AsmBackend(T, true);
 
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
 
   if (TheTriple.getEnvironment() == Triple::GNUX32)
-    return new ELFX86_X32AsmBackend(T, OSABI, CPU);
-  return new ELFX86_64AsmBackend(T, OSABI, CPU);
+    return new ELFX86_X32AsmBackend(T, OSABI);
+  return new ELFX86_64AsmBackend(T, OSABI);
 }
Index: test/CodeGen/X86/multiversioning-long-nops.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/multiversioning-long-nops.ll
@@ -0,0 +1,95 @@
+; .ll -> .o
+; RUN: llc < %s -mtriple i386-unknown-linux -filetype=obj -o - \
+; RUN:	| llvm-objdump -d -no-show-raw-insn - \
+; RUN:	| FileCheck %s
+
+; .ll -> .s -> .o
+; RUN: llc < %s -mtriple i386-unknown-linux -asm-preserve-alignment -o %t
+; RUN: cat %t | FileCheck -check-prefix=ASM %s
+; RUN: llvm-mc < %t -triple i386-unknown-linux -filetype=obj -o - \
+; RUN:	| llvm-objdump -d -no-show-raw-insn - \
+; RUN:	| FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+module asm "inc %eax"
+module asm ".p2align 3"
+module asm "inc %eax"
+
+; CHECK:	incl    %eax
+; CHECK:	nop
+; CHECK:	nop
+; CHECK:	nop
+; CHECK:	nop
+; CHECK:	nop
+; CHECK:	nop
+; CHECK:	nop
+; CHECK:	incl    %eax
+
+; ASM: 		incl    %eax
+; ASM:		.arch   generic
+; ASM-NEXT: 	.p2align        3, 0x90
+; ASM:		incl    %eax
+
+define void @test1() #0 {
+entry:
+  call void asm sideeffect "inc %eax", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align 5", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect "inc %eax", "~{dirflag},~{fpsr},~{flags}"()
+  ret void
+
+; CHECK-LABEL: test1:
+; CHECK:	incl    %eax
+; CHECK:	nopw    %cs:(%eax,%eax)
+; CHECK:	incl    %eax
+
+; ASM-LABEL: test1:
+; ASM: 		incl    %eax
+; ASM:		.arch   pentium4
+; ASM-NEXT: 	.p2align        5, 0x90
+; ASM:		incl    %eax
+}
+
+define void @test2() #1 {
+entry:
+  call void asm sideeffect "inc %eax", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align 5", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect "inc %eax", "~{dirflag},~{fpsr},~{flags}"()
+  ret void
+
+; CHECK-LABEL: test2:
+; CHECK:	incl    %eax
+; CHECK:	nopw    %cs:(%eax,%eax)
+; CHECK:	incl    %eax
+
+; ASM-LABEL: test2:
+; ASM: 		incl    %eax
+; ASM:		.arch   core-avx2
+; ASM-NEXT: 	.p2align        5, 0x90
+; ASM:		incl    %eax
+}
+
+define void @test3() #2 {
+entry:
+  call void asm sideeffect "inc %eax", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect ".p2align 5", "~{dirflag},~{fpsr},~{flags}"()
+  call void asm sideeffect "inc %eax", "~{dirflag},~{fpsr},~{flags}"()
+  ret void
+
+; CHECK-LABEL: test3:
+; CHECK:	incl    %eax
+; CHECK:	nopl    (%eax)
+; CHECK:	nopl    (%eax)
+; CHECK:	incl    %eax
+
+; ASM-LABEL: test3:
+; ASM: 		incl    %eax
+; ASM:		.arch   slm
+; ASM-NEXT: 	.p2align        5, 0x90
+; ASM:		incl    %eax
+}
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="slm" "target-features"="+aes,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: test/MC/COFF/align-nops.s
===================================================================
--- test/MC/COFF/align-nops.s
+++ test/MC/COFF/align-nops.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | llvm-readobj -s -sd | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 -mcpu=pentium4 %s | llvm-readobj -s -sd | FileCheck %s
 
 // Test that we get optimal nops in text
     .text
Index: test/MC/ELF/align-nops.s
===================================================================
--- test/MC/ELF/align-nops.s
+++ test/MC/ELF/align-nops.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -sd | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 %s -o - | llvm-readobj -s -sd | FileCheck %s
 
 // Test that we get optimal nops in text
     .text
Index: test/MC/MachO/x86_32-optimal_nop.s
===================================================================
--- test/MC/MachO/x86_32-optimal_nop.s
+++ test/MC/MachO/x86_32-optimal_nop.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -file-headers -s -sd -r -t -macho-segment -macho-dysymtab -macho-indirect-symbols | FileCheck %s
+// RUN: llvm-mc -triple i386-apple-darwin9 -mcpu=pentium4 %s -filetype=obj -o - | llvm-readobj -file-headers -s -sd -r -t -macho-segment -macho-dysymtab -macho-indirect-symbols | FileCheck %s
 
 # 1 byte nop test
         .align 4, 0 # start with 16 byte alignment filled with zeros
Index: test/MC/X86/AlignedBundling/different-sections.s
===================================================================
--- test/MC/X86/AlignedBundling/different-sections.s
+++ test/MC/X86/AlignedBundling/different-sections.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 -mc-relax-all %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
 
 # Test two different executable sections with bundling.
Index: test/MC/X86/AlignedBundling/long-nop-pad.s
===================================================================
--- test/MC/X86/AlignedBundling/long-nop-pad.s
+++ test/MC/X86/AlignedBundling/long-nop-pad.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 -mc-relax-all %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
 
 # Test that long nops are generated for padding where possible.
Index: test/MC/X86/AlignedBundling/misaligned-bundle-group.s
===================================================================
--- test/MC/X86/AlignedBundling/misaligned-bundle-group.s
+++ test/MC/X86/AlignedBundling/misaligned-bundle-group.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentium4 %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - \
 # RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentium4 -mc-relax-all %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - \
 # RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
 
Index: test/MC/X86/AlignedBundling/misaligned-bundle.s
===================================================================
--- test/MC/X86/AlignedBundling/misaligned-bundle.s
+++ test/MC/X86/AlignedBundling/misaligned-bundle.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentium4 %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - \
 # RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentium4 -mc-relax-all %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - \
 # RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
 
Index: test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
===================================================================
--- test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
+++ test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 -mc-relax-all %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
 
 # Test some variations of padding to the end of a bundle.
Index: test/MC/X86/AlignedBundling/pad-bundle-groups.s
===================================================================
--- test/MC/X86/AlignedBundling/pad-bundle-groups.s
+++ test/MC/X86/AlignedBundling/pad-bundle-groups.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 -mc-relax-all %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
 
 # Test some variations of padding for bundle-locked groups.
Index: test/MC/X86/AlignedBundling/relax-in-bundle-group.s
===================================================================
--- test/MC/X86/AlignedBundling/relax-in-bundle-group.s
+++ test/MC/X86/AlignedBundling/relax-in-bundle-group.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 %s -o - \
 # RUN:   | llvm-objdump -disassemble - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 -mc-relax-all %s -o - \
 # RUN:   | llvm-objdump -disassemble - | FileCheck %s
 
 # Test that instructions inside bundle-locked groups are relaxed even if their
Index: test/MC/X86/AlignedBundling/single-inst-bundling.s
===================================================================
--- test/MC/X86/AlignedBundling/single-inst-bundling.s
+++ test/MC/X86/AlignedBundling/single-inst-bundling.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentium4 -mc-relax-all %s -o - \
 # RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
 
 # Test simple NOP insertion for single instructions.
Index: test/MC/X86/arch-directive.s
===================================================================
--- /dev/null
+++ test/MC/X86/arch-directive.s
@@ -0,0 +1,42 @@
+# RUN: not llvm-mc -filetype=asm -triple x86_64-pc-linux-gnu %s 2>%t -o - | FileCheck %s
+# RUN: FileCheck < %t %s --check-prefix=CHECK-ERR
+
+# CHECK: .text
+.arch	core-avx2
+# CHECK: .arch core-avx2
+.arch	.avx512f
+# CHECK: .arch .avx512f
+.arch	.noavx2
+# CHECK: .arch .noavx2
+
+addq	$1, %rax
+# CHECK: addq $1, %rax
+
+.arch	i386
+# CHECK: .arch i386
+.arch	.noavx
+# CHECK: .arch .noavx
+.arch	.x87
+# CHECK: .arch .x87
+.arch	.no64bit-mode
+# CHECK: .arch .no64bit-mode
+.arch	.32bit-mode
+# CHECK: .arch .32bit-mode
+
+addq	$1, %rax
+# CHECK-NOT: addq $1, %rax
+# CHECK-ERR: error: register %rax is only available in 64-bit mode
+
+.arch	fake-cpu
+# CHECK-NOT: .arch fake-cpu
+# CHECK-ERR: error: unrecognized CPU 'fake-cpu' in .arch directive
+.arch	.fake-feature
+# CHECK-NOT: .arch .fake-feature
+# CHECK-ERR: error: unrecognized feature 'fake-feature' in .arch directive
+.arch	.nofake-feature
+# CHECK-NOT: .arch .nofake-feature
+# CHECK-ERR: error: unrecognized feature 'fake-feature' in .arch directive
+
+.arch	something unexpected
+# CHECK-NOT: .arch something unexpected
+# CHECK-ERR: error: unexpected number of arguments in .arch directive
Index: test/MC/X86/x86_long_nop.s
===================================================================
--- test/MC/X86/x86_long_nop.s
+++ test/MC/X86/x86_long_nop.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentium4 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=core-avx2 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=k8 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=barcelona %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=slm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=lakemont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
 
Index: test/MC/X86/x86_long_nop_arch.s
===================================================================
--- /dev/null
+++ test/MC/X86/x86_long_nop_arch.s
@@ -0,0 +1,44 @@
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+
+# Ensure .arch directive affects long nop emission.
+
+incl    %eax
+
+.arch   i386
+.p2align  4
+
+incl    %eax
+
+.arch   pentium4
+.p2align  4
+
+incl    %eax
+
+.arch   slm
+.p2align  4
+
+incl    %eax
+
+# CHECK:      0:    incl
+# CHECK-NEXT: 1:    nop
+# CHECK-NEXT: 2:    nop
+# CHECK-NEXT: 3:    nop
+# CHECK-NEXT: 4:    nop
+# CHECK-NEXT: 5:    nop
+# CHECK-NEXT: 6:    nop
+# CHECK-NEXT: 7:    nop
+# CHECK-NEXT: 8:    nop
+# CHECK-NEXT: 9:    nop
+# CHECK-NEXT: a:    nop
+# CHECK-NEXT: b:    nop
+# CHECK-NEXT: c:    nop
+# CHECK-NEXT: d:    nop
+# CHECK-NEXT: e:    nop
+# CHECK-NEXT: f:    nop
+# CHECK-NEXT: 10:   incl
+# CHECK-NEXT: 11:   nopw
+# CHECK-NEXT: 20:   incl
+# CHECK-NEXT: 21:   nopl
+# CHECK-NEXT: 28:   nopl
+# CHECK-NEXT: 2f:   nop
+# CHECK-NEXT: 30:   incl
Index: tools/llvm-mc/llvm-mc.cpp
===================================================================
--- tools/llvm-mc/llvm-mc.cpp
+++ tools/llvm-mc/llvm-mc.cpp
@@ -580,7 +580,7 @@
     auto FOut = llvm::make_unique<formatted_raw_ostream>(*OS);
     Str.reset(TheTarget->createAsmStreamer(
         Ctx, std::move(FOut), /*asmverbose*/ true,
-        /*useDwarfDirectory*/ true, IP, CE, MAB, ShowInst));
+        /*useDwarfDirectory*/ true, IP, CE, MAB, ShowInst, *STI));
 
   } else if (FileType == OFT_Null) {
     Str.reset(TheTarget->createNullStreamer(Ctx));