diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -96,8 +96,8 @@
 };
 
 // Emit a minimal sequence of nops spanning NumBytes bytes.
-static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
-                     const MCSubtargetInfo &STI);
+static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
+                        const X86Subtarget *Subtarget);
 
 void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
                                                  const MCSubtargetInfo &STI,
@@ -117,8 +117,8 @@
     MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
   if (InShadow && CurrentShadowSize < RequiredShadowSize) {
     InShadow = false;
-    EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
-             MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
+    emitX86Nops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
+                &MF->getSubtarget<X86Subtarget>());
   }
 }
 
@@ -1082,29 +1082,26 @@
 /// Return the longest nop which can be efficiently decoded for the given
 /// target cpu.  15-bytes is the longest single NOP instruction, but some
 /// platforms can't decode the longest forms efficiently.
-static unsigned MaxLongNopLength(const MCSubtargetInfo &STI) {
-  uint64_t MaxNopLength = 10;
-  if (STI.getFeatureBits()[X86::ProcIntelSLM])
-    MaxNopLength = 7;
-  else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
-    MaxNopLength = 15;
-  else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
-    MaxNopLength = 11;
-  return MaxNopLength;
+static unsigned maxLongNopLength(const X86Subtarget *Subtarget) {
+  if (Subtarget->getFeatureBits()[X86::ProcIntelSLM])
+    return 7;
+  if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP])
+    return 15;
+  if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP])
+    return 11;
+  if (Subtarget->getFeatureBits()[X86::FeatureNOPL] || Subtarget->is64Bit())
+    return 10;
+  if (Subtarget->is32Bit())
+    return 2;
+  return 1;
 }
 
 /// Emit the largest nop instruction smaller than or equal to \p NumBytes
 /// bytes.  Return the size of nop emitted.
-static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
-                        const MCSubtargetInfo &STI) {
-  if (!Is64Bit) {
-    // TODO Do additional checking if the CPU supports multi-byte nops.
-    OS.emitInstruction(MCInstBuilder(X86::NOOP), STI);
-    return 1;
-  }
-
+static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
+                        const X86Subtarget *Subtarget, bool LegacyNop = false) {
   // Cap a single nop emission at the profitable value for the target
-  NumBytes = std::min(NumBytes, MaxLongNopLength(STI));
+  NumBytes = std::min(NumBytes, maxLongNopLength(Subtarget));
 
   unsigned NopSize;
   unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
@@ -1121,7 +1118,7 @@
     break;
   case 2:
     NopSize = 2;
-    Opc = X86::XCHG16ar;
+    Opc = LegacyNop ? X86::MOV32rr_REV : X86::XCHG16ar;
     break;
   case 3:
     NopSize = 3;
@@ -1178,10 +1175,15 @@
   switch (Opc) {
   default: llvm_unreachable("Unexpected opcode");
   case X86::NOOP:
-    OS.emitInstruction(MCInstBuilder(Opc), STI);
+    OS.emitInstruction(MCInstBuilder(Opc), *Subtarget);
     break;
   case X86::XCHG16ar:
-    OS.emitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX), STI);
+    OS.emitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX),
+                       *Subtarget);
+    break;
+  case X86::MOV32rr_REV:
+    OS.emitInstruction(MCInstBuilder(Opc).addReg(X86::EDI).addReg(X86::EDI),
+                       *Subtarget);
     break;
   case X86::NOOPL:
   case X86::NOOPW:
@@ -1191,7 +1193,7 @@
                            .addReg(IndexReg)
                            .addImm(Displacement)
                            .addReg(SegmentReg),
-                       STI);
+                       *Subtarget);
     break;
   }
   assert(NopSize <= NumBytes && "We overemitted?");
@@ -1199,12 +1201,12 @@
 }
 
 /// Emit the optimal amount of multi-byte nops on X86.
-static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
-                     const MCSubtargetInfo &STI) {
+static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
+                        const X86Subtarget *Subtarget) {
   unsigned NopsToEmit = NumBytes;
   (void)NopsToEmit;
   while (NumBytes) {
-    NumBytes -= EmitNop(OS, NumBytes, Is64Bit, STI);
+    NumBytes -= emitNop(OS, NumBytes, Subtarget);
     assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!");
   }
 }
@@ -1217,8 +1219,7 @@
 
   StatepointOpers SOpers(&MI);
   if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
-    EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
-             getSubtargetInfo());
+    emitX86Nops(*OutStreamer, PatchBytes, Subtarget);
   } else {
     // Lower call target and choose correct opcode
     const MachineOperand &CallTarget = SOpers.getCallTarget();
@@ -1350,8 +1351,15 @@
       // bytes too, so the check on MinSize is important.
       MCI.setOpcode(X86::PUSH64rmr);
     } else {
-      unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
-                                 getSubtargetInfo());
+      // For compatibilty reasons, when targetting MSVC, is is important to
+      // generate a 'legacy' NOP in the form of a 8B FF MOV EDI, EDI. Some tools
+      // rely specifically on this pattern to be able to patch a function.
+      // This is only for 32-bit targets, when using /arch:IA32 or /arch:SSE.
+      bool LegacyNop =
+          Subtarget->is32Bit() && Subtarget->isTargetWindowsMSVC() &&
+          (Subtarget->getCPU().empty() || Subtarget->getCPU() == "pentium3");
+
+      unsigned NopSize = emitNop(*OutStreamer, MinSize, Subtarget, LegacyNop);
       assert(NopSize == MinSize && "Could not implement MinSize!");
       (void)NopSize;
     }
@@ -1435,8 +1443,7 @@
   assert(NumBytes >= EncodedBytes &&
          "Patchpoint can't request size less than the length of a call.");
 
-  EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
-           getSubtargetInfo());
+  emitX86Nops(*OutStreamer, NumBytes - EncodedBytes, Subtarget);
 }
 
 void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
@@ -1496,7 +1503,7 @@
         EmitAndCountInstruction(
             MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
       } else {
-        EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
+        emitX86Nops(*OutStreamer, 4, Subtarget);
       }
     }
 
@@ -1525,7 +1532,7 @@
     if (UsedMask[I])
       EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
     else
-      EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+      emitX86Nops(*OutStreamer, 1, Subtarget);
 
   OutStreamer->AddComment("xray custom event end.");
 
@@ -1594,7 +1601,7 @@
         EmitAndCountInstruction(
             MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
       } else {
-        EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
+        emitX86Nops(*OutStreamer, 4, Subtarget);
       }
     }
 
@@ -1628,7 +1635,7 @@
     if (UsedMask[I])
       EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
     else
-      EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+      emitX86Nops(*OutStreamer, 1, Subtarget);
 
   OutStreamer->AddComment("xray typed event end.");
 
@@ -1648,7 +1655,7 @@
             .getValueAsString()
             .getAsInteger(10, Num))
       return;
-    EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo());
+    emitX86Nops(*OutStreamer, Num, Subtarget);
     return;
   }
   // We want to emit the following pattern:
@@ -1672,7 +1679,7 @@
   // an operand (computed as an offset from the jmp instruction).
   // FIXME: Find another less hacky way do force the relative jump.
   OutStreamer->emitBytes("\xeb\x09");
-  EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
+  emitX86Nops(*OutStreamer, 9, Subtarget);
   recordSled(CurSled, MI, SledKind::FUNCTION_ENTER, 2);
 }
 
@@ -1704,7 +1711,7 @@
     if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
       Ret.addOperand(MaybeOperand.getValue());
   OutStreamer->emitInstruction(Ret, getSubtargetInfo());
-  EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
+  emitX86Nops(*OutStreamer, 10, Subtarget);
   recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2);
 }
 
@@ -1727,7 +1734,7 @@
   // an operand (computed as an offset from the jmp instruction).
   // FIXME: Find another less hacky way do force the relative jump.
   OutStreamer->emitBytes("\xeb\x09");
-  EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
+  emitX86Nops(*OutStreamer, 9, Subtarget);
   OutStreamer->emitLabel(Target);
   recordSled(CurSled, MI, SledKind::TAIL_CALL, 2);
 
diff --git a/llvm/test/CodeGen/X86/patchable-function-entry.ll b/llvm/test/CodeGen/X86/patchable-function-entry.ll
--- a/llvm/test/CodeGen/X86/patchable-function-entry.ll
+++ b/llvm/test/CodeGen/X86/patchable-function-entry.ll
@@ -31,7 +31,7 @@
 define void @f2() "patchable-function-entry"="2" {
 ; CHECK-LABEL: f2:
 ; CHECK-NEXT: .Lfunc_begin2:
-; 32-COUNT-2:  nop
+; 32:          xchgw %ax, %ax
 ; 64:          xchgw %ax, %ax
 ; CHECK-NEXT:  ret
 ; CHECK:       .section __patchable_function_entries,"awo",@progbits,f2{{$}}
@@ -46,7 +46,8 @@
 define void @f3() "patchable-function-entry"="3" comdat {
 ; CHECK-LABEL: f3:
 ; CHECK-NEXT: .Lfunc_begin3:
-; 32-COUNT-3:  nop
+; 32:          xchgw %ax, %ax
+; 32-NEXT:     nop
 ; 64:          nopl (%rax)
 ; CHECK:       ret
 ; CHECK:       .section __patchable_function_entries,"aGwo",@progbits,f3,comdat,f3{{$}}
@@ -61,7 +62,8 @@
 define void @f5() "patchable-function-entry"="5" comdat {
 ; CHECK-LABEL: f5:
 ; CHECK-NEXT: .Lfunc_begin4:
-; 32-COUNT-5:  nop
+; 32-COUNT-2:  xchgw %ax, %ax
+; 32-NEXT:     nop
 ; 64:          nopl 8(%rax,%rax)
 ; CHECK-NEXT:  ret
 ; CHECK:       .section __patchable_function_entries,"aGwo",@progbits,f5,comdat,f5{{$}}
diff --git a/llvm/test/CodeGen/X86/patchable-prologue.ll b/llvm/test/CodeGen/X86/patchable-prologue.ll
--- a/llvm/test/CodeGen/X86/patchable-prologue.ll
+++ b/llvm/test/CodeGen/X86/patchable-prologue.ll
@@ -1,5 +1,14 @@
 ; RUN: llc -verify-machineinstrs -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump --triple=x86_64-apple-macosx -d - | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386 < %s | FileCheck %s --check-prefixes=32,32CFI,XCHG
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc < %s | FileCheck %s --check-prefixes=32,MOV
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium3 < %s | FileCheck %s --check-prefixes=32,MOV
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium4 < %s | FileCheck %s --check-prefixes=32,XCHG
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=64
+; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-unknown-linux-code16 < %s | FileCheck %s --check-prefix=16
+
+; 16-NOT: movl   %edi, %edi
+; 16-NOT: xchgw   %ax, %ax
 
 declare void @callee(i64*)
 
@@ -10,6 +19,18 @@
 ; CHECK-ALIGN: 	.p2align	4, 0x90
 ; CHECK-ALIGN: _f0:
 
+; 32: f0:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax                # encoding: [0x66,0x90]
+; MOV-NEXT: movl    %edi, %edi              # encoding: [0x8b,0xff]
+; 32-NEXT: retl
+
+; 64: f0:
+; 64-NEXT: # %bb.0:
+; 64-NEXT: xchgw   %ax, %ax                # encoding: [0x66,0x90]
+; 64-NEXT: retq
+		
   ret void
 }
 
@@ -19,6 +40,19 @@
 
 ; CHECK-ALIGN: 	.p2align	4, 0x90
 ; CHECK-ALIGN: _f1:
+
+; 32: f1:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax                # encoding: [0x66,0x90]
+; MOV-NEXT: movl    %edi, %edi              # encoding: [0x8b,0xff]
+; 32-NEXT: pushl   %ebp
+
+; 64: f1:
+; 64-NEXT: .seh_proc f1
+; 64-NEXT: # %bb.0:
+; 64-NEXT: pushq   %rbp
+		
   ret void
 }
 
@@ -28,6 +62,19 @@
 
 ; CHECK-ALIGN: 	.p2align	4, 0x90
 ; CHECK-ALIGN: _f2:
+
+; 32: f2:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax                # encoding: [0x66,0x90]
+; MOV-NEXT: movl    %edi, %edi              # encoding: [0x8b,0xff]
+; 32-NEXT: pushl   %ebp
+
+; 64: f2:
+; 64-NEXT: .seh_proc f2
+; 64-NEXT: # %bb.0:
+; 64-NEXT: subq    $200, %rsp
+		
   %ptr = alloca i64, i32 20
   call void @callee(i64* %ptr)
   ret void
@@ -39,6 +86,19 @@
 
 ; CHECK-ALIGN: 	.p2align	4, 0x90
 ; CHECK-ALIGN: _f3:
+
+; 32: f3:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax
+; MOV-NEXT: movl   %edi, %edi
+; 32-NEXT: retl
+
+; 64: f3:
+; 64-NEXT: # %bb.0:
+; 64-NEXT: xchgw   %ax, %ax
+; 64-NEXT: retq
+
   ret void
 }
 
@@ -47,6 +107,17 @@
 ; patchable one.
 ; CHECK-LABEL: f4{{>?}}:
 ; CHECK-NEXT: 8b 0c 37  movl  (%rdi,%rsi), %ecx
+; 32: f4:
+; 32CFI-NEXT: .cfi_startproc
+; 32-NEXT: # %bb.0:
+; XCHG-NEXT: xchgw   %ax, %ax
+; MOV-NEXT: movl   %edi, %edi
+; 32-NEXT: pushl   %ebx
+
+; 64: f4:
+; 64-NEXT: # %bb.0:
+; 64-NOT: xchgw   %ax, %ax
+
 define i32 @f4(i8* %arg1, i64 %arg2, i32 %arg3) "patchable-function"="prologue-short-redirect" {
 bb:
   %tmp10 = getelementptr i8, i8* %arg1, i64 %arg2