Index: include/llvm/Target/TargetInstrInfo.h
===================================================================
--- include/llvm/Target/TargetInstrInfo.h
+++ include/llvm/Target/TargetInstrInfo.h
@@ -817,6 +817,16 @@
   /// anything was changed.
   virtual bool expandPostRAPseudo(MachineInstr &MI) const { return false; }
 
+  /// Check whether the target can fold a load that feeds a subreg operand
+  /// (or a subreg operand that feeds a store).
+  /// For stores, LoadMI is always null. For loads, LoadMI is non-null if
+  /// we're trying to fold an existing load instruction, and null if we're
+  /// trying to fold a reload from a stack slot.
+  virtual bool isSubregFoldable(MachineOperand &MO,
+                                MachineInstr *LoadMI) const {
+    return false;
+  }
+
   /// Attempt to fold a load or store of the specified stack
   /// slot into the specified machine instruction for the specified operand(s).
   /// If this is possible, a new instruction is returned with the specified
Index: lib/CodeGen/InlineSpiller.cpp
===================================================================
--- lib/CodeGen/InlineSpiller.cpp
+++ lib/CodeGen/InlineSpiller.cpp
@@ -739,6 +739,7 @@
   bool WasCopy = MI->isCopy();
   unsigned ImpReg = 0;
 
+  // We always want to spill subregs for stackmap/patchpoint pseudos.
   bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT ||
                        MI->getOpcode() == TargetOpcode::PATCHPOINT ||
                        MI->getOpcode() == TargetOpcode::STACKMAP);
@@ -754,8 +755,9 @@
       ImpReg = MO.getReg();
       continue;
     }
-    // FIXME: Teach targets to deal with subregs.
-    if (!SpillSubRegs && MO.getSubReg())
+
+    // Otherwise, only fold subreg operands if the target can handle them.
+    if (!SpillSubRegs && MO.getSubReg() && !TII.isSubregFoldable(MO, LoadMI))
       return false;
     // We cannot fold a load instruction into a def.
     if (LoadMI && MO.isDef())
Index: lib/CodeGen/TargetInstrInfo.cpp
===================================================================
--- lib/CodeGen/TargetInstrInfo.cpp
+++ lib/CodeGen/TargetInstrInfo.cpp
@@ -529,6 +529,29 @@
     NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS);
   }
 
+  // If we're not folding a load into a subreg, the size of the load is the
+  // size of the spill slot. But if we are, we need to figure out what the
+  // actual load size is.
+  int64_t MemSize = 0;
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  if (Flags & MachineMemOperand::MOStore) {
+    MemSize = MFI.getObjectSize(FI);
+  } else {
+    for (unsigned Idx : Ops) {
+      int64_t OpSize = MFI.getObjectSize(FI);
+
+      if (auto SubReg = MI.getOperand(Idx).getSubReg()) {
+        unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg);
+        if (SubRegSize > 0 && !(SubRegSize % 8))
+          OpSize = SubRegSize / 8;
+      }
+
+      MemSize = std::max(MemSize, OpSize);
+    }
+  }
+
   if (NewMI) {
     NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
     // Add a memory operand, foldMemoryOperandImpl doesn't do that.
@@ -538,10 +561,9 @@
     assert((!(Flags & MachineMemOperand::MOLoad) ||
             NewMI->mayLoad()) &&
            "Folded a use to a non-load!");
-    const MachineFrameInfo &MFI = MF.getFrameInfo();
     assert(MFI.getObjectOffset(FI) != -1);
     MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, FI), Flags, MFI.getObjectSize(FI),
+        MachinePointerInfo::getFixedStack(MF, FI), Flags, MemSize,
         MFI.getObjectAlignment(FI));
     NewMI->addMemOperand(MF, MMO);
 
@@ -558,7 +580,6 @@
 
   const MachineOperand &MO = MI.getOperand(1 - Ops[0]);
   MachineBasicBlock::iterator Pos = MI;
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
   if (Flags == MachineMemOperand::MOStore)
     storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -378,6 +378,14 @@
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
+  /// Check whether the target can fold a load that feeds a subreg operand
+  /// (or a subreg operand that feeds a store).
+  /// For stores, LoadMI is always null. For loads, LoadMI is non-null if
+  /// we're trying to fold an existing load instruction, and null if we're
+  /// trying to fold a reload from a stack slot.
+  bool isSubregFoldable(MachineOperand &MO,
+                        MachineInstr *LoadMI) const override;
+
   /// foldMemoryOperand - If this target supports it, fold a load or store of
   /// the specified stack slot into the specified machine instruction for the
   /// specified operand(s). If this is possible, the target should perform the
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -6276,6 +6276,18 @@
   return nullptr;
 }
 
+bool X86InstrInfo::isSubregFoldable(MachineOperand &MO,
+                                    MachineInstr *LoadMI) const {
+  // We only support folding reloads from stack slots, and only into uses.
+  if (LoadMI || MO.isDef())
+    return false;
+
+  // We don't want to try to fold into subregs that have a non-zero offset
+  // from the register start. Generally, this should be a TRI query, but
+  // we know there's only one such case on x86.
+  return MO.getSubReg() != X86::sub_8bit_hi;
+}
+
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
Index: test/CodeGen/X86/partial-fold.ll
===================================================================
--- test/CodeGen/X86/partial-fold.ll
+++ test/CodeGen/X86/partial-fold.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @fold64to32(i64 %add, i32 %spill) {
+; CHECK-LABEL: fold64to32:
+; CHECK: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK: subl -{{[0-9]+}}(%rsp), %esi # 4-byte Folded Reload
+entry:
+  tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
+  %trunc = trunc i64 %add to i32
+  %sub = sub i32 %spill, %trunc
+  ret i32 %sub
+}
+
+define i8 @fold64to8(i64 %add, i8 %spill) {
+; CHECK-LABEL: fold64to8:
+; CHECK: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK: subb -{{[0-9]+}}(%rsp), %sil # 1-byte Folded Reload
+entry:
+  tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
+  %trunc = trunc i64 %add to i8
+  %sub = sub i8 %spill, %trunc
+  ret i8 %sub
+}
Index: test/CodeGen/X86/vector-half-conversions.ll
===================================================================
--- test/CodeGen/X86/vector-half-conversions.ll
+++ test/CodeGen/X86/vector-half-conversions.ll
@@ -4788,9 +4788,8 @@
 ; AVX1-NEXT: orl %ebx, %r14d
 ; AVX1-NEXT: shlq $32, %r14
 ; AVX1-NEXT: orq %r15, %r14
-; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movw %ax, %bx
 ; AVX1-NEXT: shll $16, %ebx
@@ -4856,9 +4855,8 @@
 ; AVX2-NEXT: orl %ebx, %r14d
 ; AVX2-NEXT: shlq $32, %r14
 ; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movw %ax, %bx
 ; AVX2-NEXT: shll $16, %ebx
@@ -5585,9 +5583,8 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movl %eax, %r12d
 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
@@ -5654,9 +5651,8 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movl %eax, %r12d
 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
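
Note: the sketch below is illustrative only and is not part of this patch. It shows how a hypothetical out-of-tree backend could opt in to the new hook; FooInstrInfo and its sub-register layout are invented names, standing in for a target on which every sub-register index starts at bit offset 0 of the containing register, so a narrower load from the spill slot always produces the right bits.

bool FooInstrInfo::isSubregFoldable(MachineOperand &MO,
                                    MachineInstr *LoadMI) const {
  // As in the X86 implementation above, only allow folding a reload from a
  // stack slot (LoadMI == nullptr), and only into a use operand.
  if (LoadMI || MO.isDef())
    return false;

  // Hypothetical target property: every sub-register index begins at offset
  // zero of the full register, so any subreg use may be folded.
  return true;
}

Once a target returns true here, the common code in TargetInstrInfo::foldMemoryOperand shrinks the folded memory operand to the sub-register width, e.g. a sub_32bit use of an 8-byte spill slot becomes a 4-byte folded reload, which is what the partial-fold.ll test above checks for x86.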