Index: include/llvm/Target/TargetInstrInfo.h
===================================================================
--- include/llvm/Target/TargetInstrInfo.h
+++ include/llvm/Target/TargetInstrInfo.h
@@ -817,6 +817,16 @@
   /// anything was changed.
   virtual bool expandPostRAPseudo(MachineInstr &MI) const { return false; }
 
+  /// Check whether the target can fold a load that feeds a subreg operand
+  /// (or a subreg operand that feeds a store).
+  /// For stores, LoadMI is always null. For loads, LoadMI is non-null if
+  /// we're trying to fold an existing load instruction, and null if we're
+  /// trying to fold a reload from a stack slot.
+  virtual bool isSubregFoldable(MachineOperand &MO,
+                                MachineInstr *LoadMI) const {
+    return false;
+  }
+
   /// Attempt to fold a load or store of the specified stack
   /// slot into the specified machine instruction for the specified operand(s).
   /// If this is possible, a new instruction is returned with the specified
Index: lib/CodeGen/InlineSpiller.cpp
===================================================================
--- lib/CodeGen/InlineSpiller.cpp
+++ lib/CodeGen/InlineSpiller.cpp
@@ -739,6 +739,7 @@
   bool WasCopy = MI->isCopy();
   unsigned ImpReg = 0;
 
+  // We always want to spill subregs for stackmap/patchpoint pseudos.
   bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT ||
                        MI->getOpcode() == TargetOpcode::PATCHPOINT ||
                        MI->getOpcode() == TargetOpcode::STACKMAP);
@@ -754,8 +755,9 @@
       ImpReg = MO.getReg();
       continue;
     }
-    // FIXME: Teach targets to deal with subregs.
-    if (!SpillSubRegs && MO.getSubReg())
+
+    // Otherwise, only fold subreg operands if the target can handle them.
+    if (!SpillSubRegs && MO.getSubReg() && !TII.isSubregFoldable(MO, LoadMI))
       return false;
     // We cannot fold a load instruction into a def.
     if (LoadMI && MO.isDef())
Index: lib/CodeGen/TargetInstrInfo.cpp
===================================================================
--- lib/CodeGen/TargetInstrInfo.cpp
+++ lib/CodeGen/TargetInstrInfo.cpp
@@ -529,6 +529,29 @@
     NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS);
   }
 
+  // If we're not folding a load into a subreg, the size of the load is the
+  // size of the spill slot. But if we are, we need to figure out what the
+  // actual load size is.
+  int64_t MemSize = 0;
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  if (Flags & MachineMemOperand::MOStore) {
+    MemSize = MFI.getObjectSize(FI);
+  } else {
+    for (unsigned Idx : Ops) {
+      int64_t OpSize = MFI.getObjectSize(FI);
+
+      if (auto SubReg = MI.getOperand(Idx).getSubReg()) {
+        unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg);
+        if (SubRegSize > 0 && !(SubRegSize % 8))
+          OpSize = SubRegSize / 8;
+      }
+
+      MemSize = std::max(MemSize, OpSize);
+    }
+  }
+
   if (NewMI) {
     NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
     // Add a memory operand, foldMemoryOperandImpl doesn't do that.
@@ -538,10 +561,9 @@
     assert((!(Flags & MachineMemOperand::MOLoad) ||
             NewMI->mayLoad()) &&
            "Folded a use to a non-load!");
-    const MachineFrameInfo &MFI = MF.getFrameInfo();
     assert(MFI.getObjectOffset(FI) != -1);
     MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, FI), Flags, MFI.getObjectSize(FI),
+        MachinePointerInfo::getFixedStack(MF, FI), Flags, MemSize,
         MFI.getObjectAlignment(FI));
     NewMI->addMemOperand(MF, MMO);
 
@@ -558,7 +580,6 @@
 
   const MachineOperand &MO = MI.getOperand(1 - Ops[0]);
   MachineBasicBlock::iterator Pos = MI;
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
   if (Flags == MachineMemOperand::MOStore)
     storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -378,6 +378,14 @@
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
+  /// Check whether the target can fold a load that feeds a subreg operand
+  /// (or a subreg operand that feeds a store).
+  /// For stores, LoadMI is always null. For loads, LoadMI is non-null if
+  /// we're trying to fold an existing load instruction, and null if we're
+  /// trying to fold a reload from a stack slot.
+  bool isSubregFoldable(MachineOperand &MO,
+                        MachineInstr *LoadMI) const override;
+
   /// foldMemoryOperand - If this target supports it, fold a load or store of
   /// the specified stack slot into the specified machine instruction for the
   /// specified operand(s). If this is possible, the target should perform the
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -6276,6 +6276,18 @@
   return nullptr;
 }
 
+bool X86InstrInfo::isSubregFoldable(MachineOperand &MO,
+                                    MachineInstr *LoadMI) const {
+  // We only support folding reloads from stack slots, and only into uses.
+  if (LoadMI || MO.isDef())
+    return false;
+
+  // We don't want to try to fold into subregs that have a non-zero offset
+  // from the register start. Generally, this should be a TRI query, but
+  // we know there's only one such case on x86.
+  return MO.getSubReg() != X86::sub_8bit_hi;
+}
+
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
Index: test/CodeGen/X86/partial-fold.ll
===================================================================
--- test/CodeGen/X86/partial-fold.ll
+++ test/CodeGen/X86/partial-fold.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @fold64to32(i64 %add, i32 %spill) {
+; CHECK-LABEL: fold64to32:
+; CHECK: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK: subl -{{[0-9]+}}(%rsp), %esi # 4-byte Folded Reload
+entry:
+  tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
+  %trunc = trunc i64 %add to i32
+  %sub = sub i32 %spill, %trunc
+  ret i32 %sub
+}
+
+define i8 @fold64to8(i64 %add, i8 %spill) {
+; CHECK-LABEL: fold64to8:
+; CHECK: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK: subb -{{[0-9]+}}(%rsp), %sil # 1-byte Folded Reload
+entry:
+  tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"()
+  %trunc = trunc i64 %add to i8
+  %sub = sub i8 %spill, %trunc
+  ret i8 %sub
+}
Index: test/CodeGen/X86/vector-half-conversions.ll
===================================================================
--- test/CodeGen/X86/vector-half-conversions.ll
+++ test/CodeGen/X86/vector-half-conversions.ll
@@ -4788,9 +4788,8 @@
 ; AVX1-NEXT: orl %ebx, %r14d
 ; AVX1-NEXT: shlq $32, %r14
 ; AVX1-NEXT: orq %r15, %r14
-; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movw %ax, %bx
 ; AVX1-NEXT: shll $16, %ebx
@@ -4856,9 +4855,8 @@
 ; AVX2-NEXT: orl %ebx, %r14d
 ; AVX2-NEXT: shlq $32, %r14
 ; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movw %ax, %bx
 ; AVX2-NEXT: shll $16, %ebx
@@ -5585,9 +5583,8 @@
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
 ; AVX1-NEXT: callq __truncdfhf2
 ; AVX1-NEXT: movl %eax, %r12d
 ; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
@@ -5654,9 +5651,8 @@
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
 ; AVX2-NEXT: callq __truncdfhf2
 ; AVX2-NEXT: movl %eax, %r12d
 ; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
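
Note: the sketch below is illustrative only and is not part of this patch. It shows how a hypothetical out-of-tree backend could opt in to the new hook; FooInstrInfo and its sub-register layout are invented names, standing in for a target on which every sub-register index starts at bit offset 0 of the containing register, so a narrower load from the spill slot always produces the right bits.

bool FooInstrInfo::isSubregFoldable(MachineOperand &MO,
                                    MachineInstr *LoadMI) const {
  // As in the X86 implementation above, only allow folding a reload from a
  // stack slot (LoadMI == nullptr), and only into a use operand.
  if (LoadMI || MO.isDef())
    return false;

  // Hypothetical target property: every sub-register index begins at offset
  // zero of the full register, so any subreg use may be folded.
  return true;
}

Once a target returns true here, the common code in TargetInstrInfo::foldMemoryOperand shrinks the folded memory operand to the sub-register width, e.g. a sub_32bit use of an 8-byte spill slot becomes a 4-byte folded reload, which is what the partial-fold.ll test above checks for x86.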