diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2050,6 +2050,20 @@
     llvm_unreachable("Target does not implement this");
   }
 
+  /// If insert/extracts of size \p MemBits require a new register, return the
+  /// register class.
+  virtual const TargetRegisterClass *
+  spill2RegInsertOrExtractRequiresNewReg(unsigned MemBits,
+                                         const TargetRegisterInfo *TRI) const {
+    llvm_unreachable("Target does not implement this");
+  }
+
+  /// \Returns the subreg index for converting \p FromBits to \p ToBits.
+  virtual unsigned spill2RegGetSubregIdx(unsigned FromBits, unsigned ToBits,
+                                         const TargetRegisterInfo *TRI) const {
+    llvm_unreachable("Target does not implement this");
+  }
+
 private:
   mutable std::unique_ptr<MIRFormatter> Formatter;
   unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp
--- a/llvm/lib/CodeGen/Spill2Reg.cpp
+++ b/llvm/lib/CodeGen/Spill2Reg.cpp
@@ -112,13 +112,22 @@
                            const LiveRegUnits &LRU);
   /// Helper for generateCode(). It replaces stack spills or reloads with movs
   /// to \p LI.reg().
-  void replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg);
+  void replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg,
+                           int StackSlot);
   /// Updates the live-ins of MBBs after we emit the new spill2reg instructions
   /// and the vector registers become live from register spills to reloads.
   void updateLiveIns(StackSlotDataEntry &Entry, MCRegister VectorReg);
   /// Updates \p LRU with the liveness of physical registers around the spills
   /// and reloads in \p Entry.
   void calculateLiveRegs(StackSlotDataEntry &Entry, LiveRegUnits &LRU);
+  /// Replaces all occurrences of \p OldReg with \p NewReg and sets the subreg
+  /// accordingly. Helper for replaceStackWithReg().
+  void replaceRegAndSubreg(Register OldReg, Register NewReg,
+                           const MachineInstr *S2RMI, unsigned NewSubregIdx);
+  /// If the insert/extracts touch wider registers than \p MemBits, then \p
+  /// OldReg requires a subreg. This is taken care of in this method.
+  void replaceWithSubregIfRequired(uint32_t MemBits, Register OldReg,
+                                   MachineInstr *MI, int StackSlot);
   /// Replace spills to stack with spills to registers (same for reloads).
   void generateCode();
   /// Cleanup data structures once the pass is finished.
@@ -179,14 +188,22 @@
 
   /// The checks for collecting spills and reloads are identical, so we keep
   /// them here in one place. Return true if we should not collect this.
-  auto SkipEntry = [this](int StackSlot, unsigned MemBits,
-                          unsigned Opcode) -> bool {
+  auto SkipEntry = [this](int StackSlot, unsigned MemBits, unsigned Opcode,
+                          const MachineOperand *MO) -> bool {
     // If not a spill/reload stack slot.
     if (!MFI->isSpillSlotObjectIndex(StackSlot))
       return true;
     // Check size in bits.
     if (!TII->isLegalToSpill2Reg(MemBits, Opcode))
       return true;
+    // If we need a subregister, then we need more checks:
+    if (TII->spill2RegInsertOrExtractRequiresNewReg(MemBits, TRI) &&
+        llvm::any_of(MRI->reg_operands(MO->getReg()), [](const auto &MO) {
+          // Skip if any MO is non-renamable or if MO's parent instr is a pseudo
+          return !MO.isRenamable() || MO.getParent()->isPseudo();
+        }))
+      return true;
+
     return false;
   };
 
@@ -209,7 +226,7 @@
         MachineInstr *Spill = &MI;
         auto &Entry = StackSlotData[StackSlot];
         unsigned MemBits = TRI->getRegSizeInBits(MO->getReg(), *MRI);
-        if (SkipEntry(StackSlot, MemBits, MI.getOpcode())) {
+        if (SkipEntry(StackSlot, MemBits, MI.getOpcode(), MO)) {
           Entry.Disable = true;
           continue;
         }
@@ -227,7 +244,7 @@
         MachineInstr *Reload = &MI;
         auto &Entry = StackSlotData[StackSlot];
         unsigned MemBits = TRI->getRegSizeInBits(MO->getReg(), *MRI);
-        if (SkipEntry(StackSlot, MemBits, MI.getOpcode())) {
+        if (SkipEntry(StackSlot, MemBits, MI.getOpcode(), MO)) {
           Entry.Disable = true;
           continue;
         }
@@ -332,9 +349,54 @@
   }
 }
 
+void Spill2Reg::replaceRegAndSubreg(Register OldReg, Register NewReg,
+                                    const MachineInstr *S2RMI,
+                                    unsigned NewSubregIdx) {
+  for (MachineOperand &MO :
+       llvm::make_early_inc_range(MRI->reg_operands(OldReg))) {
+    const MachineInstr *ModifiedI = MO.getParent();
+    assert((MO.isRenamable() || ModifiedI == S2RMI) &&
+           "Should have been discarded earlier");
+    MO.setReg(NewReg);
+    MO.setIsRenamable(true);
+    // The instr emitted by Spill2Reg does not need a subreg, so skip.
+    if (ModifiedI == S2RMI)
+      continue;
+    // Unless we mark a def as 'undef', the rest of the register will be
+    // considered as being read, which creates a use before a def.
+    if (MO.isDef())
+      MO.setIsUndef();
+    MO.setSubReg(NewSubregIdx);
+  }
+}
+
+void Spill2Reg::replaceWithSubregIfRequired(uint32_t MemBits, Register OldReg,
+                                            MachineInstr *MI, int StackSlot) {
+  const TargetRegisterClass *NewRegClass =
+      TII->spill2RegInsertOrExtractRequiresNewReg(MemBits, TRI);
+  // Early return if the target does not support this feature.
+  if (NewRegClass == nullptr)
+    return;
+
+  // If multiple reloads are writing to the same OldReg, then a previous
+  // invocation of this function will have already replaced the register with
+  // the new one. In that case do nothing.
+  uint32_t OldRegBits = TRI->getRegSizeInBits(OldReg, *MRI);
+  uint32_t NewRegBits = TRI->getRegSizeInBits(*NewRegClass);
+  if (NewRegBits == OldRegBits)
+    return;
+
+  // Get the new register and replace instances of OldReg with NewReg.
+  unsigned NewSubregIdx =
+      TII->spill2RegGetSubregIdx(NewRegBits, OldRegBits, TRI);
+  MCRegister NewReg =
+      TRI->getMatchingSuperReg(OldReg.asMCReg(), NewSubregIdx, NewRegClass);
+  replaceRegAndSubreg(OldReg, NewReg, MI, NewSubregIdx);
+}
+
 // Replace stack-based spills/reloads with register-based ones.
 void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
-                                    Register VectorReg) {
+                                    Register VectorReg, int StackSlot) {
   for (StackSlotDataEntry::MIData &SpillData : Entry.Spills) {
     MachineInstr *StackSpill = SpillData.MI;
     assert(SpillData.MO->isReg() && "Expected register MO");
@@ -349,6 +411,9 @@
 
     // Spill to stack is no longer needed.
     StackSpill->eraseFromParent();
+    // Set subregister if required.
+    replaceWithSubregIfRequired(SpillData.MemBits, OldReg, SpillToVector,
+                                StackSlot);
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
   }
 
@@ -366,6 +431,8 @@
 
     // Reload from stack is no longer needed.
     StackReload->eraseFromParent();
+    replaceWithSubregIfRequired(ReloadData.MemBits, OldReg, ReloadFromReg,
+                                StackSlot);
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
   }
 }
@@ -482,7 +549,8 @@
     updateLiveIns(Entry, *PhysVectorRegOpt);
 
     // Replace stack accesses with register accesses.
-    replaceStackWithReg(Entry, *PhysVectorRegOpt);
+    int StackSlot = Pair.first;
+    replaceStackWithReg(Entry, *PhysVectorRegOpt, StackSlot);
 
     NumSpill2RegInstrs += Entry.Spills.size() + Entry.Reloads.size();
   }
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -684,8 +684,13 @@
                                 int OperationBits, MachineBasicBlock *InsertMBB,
                                 MachineBasicBlock::iterator InsertBeforeIt,
                                 const TargetRegisterInfo *TRI) const override;
-};
+  const TargetRegisterClass *spill2RegInsertOrExtractRequiresNewReg(
+      unsigned MemBits, const TargetRegisterInfo *TRI) const override;
+
+  unsigned spill2RegGetSubregIdx(unsigned FromBits, unsigned ToBits,
+                                 const TargetRegisterInfo *TRI) const override;
+};
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -9517,6 +9517,8 @@
   switch (MemBits) {
   case 64:
   case 32:
+  case 16:
+  case 8:
     return true;
   }
   return false;
@@ -9579,6 +9581,9 @@
 
 static unsigned getInsertOrExtractOpcode(unsigned Bits, bool Insert) {
   switch (Bits) {
+  case 8:
+  case 16:
+    return Insert ? X86::MOVDI2PDIrr : X86::MOVPDI2DIrr;
   case 32:
     return Insert ? X86::MOVDI2PDIrr : X86::MOVPDI2DIrr;
   case 64:
@@ -9617,5 +9622,35 @@
   return ExtractMI;
 }
 
+const TargetRegisterClass *X86InstrInfo::spill2RegInsertOrExtractRequiresNewReg(
+    unsigned MemBits, const TargetRegisterInfo *TRI) const {
+  switch (MemBits) {
+  case 8:
+  case 16:
+    return TRI->getRegClass(X86::GR32RegClassID);
+  default:
+    return nullptr;
+  }
+}
+
+unsigned
+X86InstrInfo::spill2RegGetSubregIdx(unsigned FromBits, unsigned ToBits,
+                                    const TargetRegisterInfo *TRI) const {
+  if (FromBits == ToBits)
+    return 0;
+
+  assert(FromBits > ToBits && "From expected to cover To");
+  switch (ToBits) {
+  case 32:
+    return X86::sub_32bit;
+  case 16:
+    return X86::sub_16bit;
+  case 8:
+    return X86::sub_8bit;
+  default:
+    llvm_unreachable("FIXME");
+  }
+}
+
 #define GET_INSTRINFO_HELPERS
 #include "X86GenInstrInfo.inc"
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
--- a/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
@@ -67,8 +67,8 @@
 ; CHECK-NEXT:    .cfi_offset %r14, -32
 ; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movzwl D0(%rip), %eax
-; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movw D0(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm3
 ; CHECK-NEXT:    movzwl D1(%rip), %ecx
 ; CHECK-NEXT:    movzwl D2(%rip), %edx
 ; CHECK-NEXT:    movzwl D3(%rip), %esi
@@ -83,18 +83,18 @@
 ; CHECK-NEXT:    movzwl D12(%rip), %r15d
 ; CHECK-NEXT:    movzwl D13(%rip), %r12d
 ; CHECK-NEXT:    movzwl D14(%rip), %r13d
-; CHECK-NEXT:    movzwl D15(%rip), %eax
-; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; CHECK-NEXT:    movzwl D16(%rip), %eax
-; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; CHECK-NEXT:    movzwl D17(%rip), %eax
-; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; CHECK-NEXT:    movzwl D18(%rip), %eax
-; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movw D15(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    movw D16(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm1
+; CHECK-NEXT:    movw D17(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm4
+; CHECK-NEXT:    movw D18(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm2
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
-; CHECK-NEXT:    movw %ax, U0(%rip)
+; CHECK-NEXT:    movd %xmm3, %eax
+; CHECK-NEXT:    movw %eax, U0(%rip)
 ; CHECK-NEXT:    movw %cx, U1(%rip)
 ; CHECK-NEXT:    movw %dx, U2(%rip)
 ; CHECK-NEXT:    movw %si, U3(%rip)
@@ -109,14 +109,14 @@
 ; CHECK-NEXT:    movw %r15w, U12(%rip)
 ; CHECK-NEXT:    movw %r12w, U13(%rip)
 ; CHECK-NEXT:    movw %r13w, U14(%rip)
-; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
-; CHECK-NEXT:    movw %ax, U15(%rip)
-; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
-; CHECK-NEXT:    movw %ax, U16(%rip)
-; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
-; CHECK-NEXT:    movw %ax, U17(%rip)
-; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
-; CHECK-NEXT:    movw %ax, U18(%rip)
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    movw %eax, U15(%rip)
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    movw %eax, U16(%rip)
+; CHECK-NEXT:    movd %xmm4, %eax
+; CHECK-NEXT:    movw %eax, U17(%rip)
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    movw %eax, U18(%rip)
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
--- a/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
@@ -67,8 +67,8 @@
 ; CHECK-NEXT:    .cfi_offset %r14, -32
 ; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movb D0(%rip), %al
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb D0(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm3
 ; CHECK-NEXT:    movb D1(%rip), %cl
 ; CHECK-NEXT:    movb D2(%rip), %dl
 ; CHECK-NEXT:    movb D3(%rip), %sil
@@ -83,18 +83,18 @@
 ; CHECK-NEXT:    movb D12(%rip), %r15b
 ; CHECK-NEXT:    movb D13(%rip), %r12b
 ; CHECK-NEXT:    movb D14(%rip), %r13b
-; CHECK-NEXT:    movb D15(%rip), %al
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT:    movb D16(%rip), %al
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT:    movb D17(%rip), %al
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT:    movb D18(%rip), %al
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb D15(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    movb D16(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm1
+; CHECK-NEXT:    movb D17(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm4
+; CHECK-NEXT:    movb D18(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm2
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
-; CHECK-NEXT:    movb %al, U0(%rip)
+; CHECK-NEXT:    movd %xmm3, %eax
+; CHECK-NEXT:    movb %eax, U0(%rip)
 ; CHECK-NEXT:    movb %cl, U1(%rip)
 ; CHECK-NEXT:    movb %dl, U2(%rip)
 ; CHECK-NEXT:    movb %sil, U3(%rip)
@@ -109,14 +109,14 @@
 ; CHECK-NEXT:    movb %r15b, U12(%rip)
 ; CHECK-NEXT:    movb %r12b, U13(%rip)
 ; CHECK-NEXT:    movb %r13b, U14(%rip)
-; CHECK-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
-; CHECK-NEXT:    movb %al, U15(%rip)
-; CHECK-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
-; CHECK-NEXT:    movb %al, U16(%rip)
-; CHECK-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
-; CHECK-NEXT:    movb %al, U17(%rip)
-; CHECK-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
-; CHECK-NEXT:    movb %al, U18(%rip)
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    movb %eax, U15(%rip)
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    movb %eax, U16(%rip)
+; CHECK-NEXT:    movd %xmm4, %eax
+; CHECK-NEXT:    movb %eax, U17(%rip)
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    movb %eax, U18(%rip)
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12