Index: lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.h +++ lib/Target/AArch64/AArch64InstrInfo.h @@ -162,6 +162,10 @@ int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + // This tells target independent code that it is okay to pass instructions + // with subreg operands to foldMemoryOperandImpl. + bool isSubregFoldable() const override { return true; } + using TargetInstrInfo::foldMemoryOperandImpl; MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, Index: lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.cpp +++ lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2583,7 +2583,7 @@ // // // - if (MI.isCopy()) { + if (MI.isFullCopy()) { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && @@ -2598,7 +2598,7 @@ } } - // Handle the case where a copy is being spilled or refilled but the source + // Handle the case where a copy is being spilled or filled but the source // and destination register class don't match. For example: // // %vreg0 = COPY %XZR; GPR64common:%vreg0 @@ -2613,7 +2613,7 @@ // // %vreg0 = COPY %vreg1; GPR64:%vreg0, FPR64:%vreg1 // - // will be refilled as + // will be filled as // // LDRDui %vreg0, fi<#0> // @@ -2622,9 +2622,11 @@ // LDRXui %vregTemp, fi<#0> // %vreg0 = FMOV %vregTemp // - if (MI.isFullCopy() && Ops.size() == 1 && + if (MI.isCopy() && Ops.size() == 1 && // Make sure we're only folding the explicit COPY defs/uses. 
(Ops[0] == 0 || Ops[0] == 1)) { + bool IsSpill = Ops[0] == 0; + bool IsFill = !IsSpill; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock &MBB = *MI.getParent(); @@ -2632,21 +2634,112 @@ const MachineOperand &SrcMO = MI.getOperand(1); unsigned DstReg = DstMO.getReg(); unsigned SrcReg = SrcMO.getReg(); + // This is slightly expensive to compute for physical regs since + // getMinimalPhysRegClass is slow. auto getRegClass = [&](unsigned Reg) { return TargetRegisterInfo::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) : TRI.getMinimalPhysRegClass(Reg); }; - const TargetRegisterClass &DstRC = *getRegClass(DstReg); - const TargetRegisterClass &SrcRC = *getRegClass(SrcReg); - if (DstRC.getSize() == SrcRC.getSize()) { - if (Ops[0] == 0) + + if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { + assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() && + "Mismatched register size in non subreg COPY"); + if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, - &SrcRC, &TRI); + getRegClass(SrcReg), &TRI); else - loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, &DstRC, &TRI); + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, + getRegClass(DstReg), &TRI); return &*--InsertPt; } + + // Handle cases like spilling def of: + // + // %vreg0:sub_32 = COPY %WZR; GPR64common:%vreg0 + // + // where the physical register source can be widened and stored to the full + // virtual reg destination stack slot, in this case producing: + // + // STRXui %XZR, fi<#0> + // + if (IsSpill && DstMO.isUndef() && + TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + assert(SrcMO.getSubReg() == 0 && + "Unexpected subreg on physical register"); + const TargetRegisterClass *SpillRC; + unsigned SpillSubreg; + switch (DstMO.getSubReg()) { + default: + SpillRC = nullptr; + break; + case AArch64::sub_32: + case AArch64::ssub: + if 
(AArch64::GPR32RegClass.contains(SrcReg)) { + SpillRC = &AArch64::GPR64RegClass; + SpillSubreg = AArch64::sub_32; + } else if (AArch64::FPR32RegClass.contains(SrcReg)) { + SpillRC = &AArch64::FPR64RegClass; + SpillSubreg = AArch64::ssub; + } else + SpillRC = nullptr; + break; + case AArch64::dsub: + if (AArch64::FPR64RegClass.contains(SrcReg)) { + SpillRC = &AArch64::FPR128RegClass; + SpillSubreg = AArch64::dsub; + } else + SpillRC = nullptr; + break; + } + + if (SpillRC) + if (unsigned WidenedSrcReg = + TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { + storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), + FrameIndex, SpillRC, &TRI); + return &*--InsertPt; + } + } + + // Handle cases like filling use of: + // + // %vreg0:sub_32 = COPY %vreg1; GPR64:%vreg0, GPR32:%vreg1 + // + // where we can load the full virtual reg source stack slot, into the subreg + // destination, in this case producing: + // + // LDRWui %vreg0:sub_32, fi<#0> + // + if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { + const TargetRegisterClass *FillRC; + switch (DstMO.getSubReg()) { + default: + FillRC = nullptr; + break; + case AArch64::sub_32: + FillRC = &AArch64::GPR32RegClass; + break; + case AArch64::ssub: + FillRC = &AArch64::FPR32RegClass; + break; + case AArch64::dsub: + FillRC = &AArch64::FPR64RegClass; + break; + } + + if (FillRC) { + assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() && + "Mismatched regclass size on folded subreg COPY"); + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); + MachineInstr &LoadMI = *--InsertPt; + MachineOperand &LoadDst = LoadMI.getOperand(0); + assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); + LoadDst.setSubReg(DstMO.getSubReg()); + LoadDst.setIsUndef(); + return &LoadMI; + } + } } // Cannot fold. 
Index: test/CodeGen/MIR/AArch64/spill-fold.mir =================================================================== --- /dev/null +++ test/CodeGen/MIR/AArch64/spill-fold.mir @@ -0,0 +1,82 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass greedy -verify-machineinstrs -o - %s | FileCheck %s +--- | + define i64 @test_subreg_spill_fold() { ret i64 0 } + define i64 @test_subreg_spill_fold2() { ret i64 0 } + define i64 @test_subreg_spill_fold3() { ret i64 0 } + define i64 @test_subreg_fill_fold() { ret i64 0 } + define double @test_subreg_fill_fold2() { ret double 0.0 } +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold +# Ensure that the spilled subreg COPY is eliminated and folded into the spill store. +name: test_subreg_spill_fold +registers: + - { id: 0, class: gpr64 } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.sub_32 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + %x0 = COPY %0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold2 +# Similar to test_subreg_spill_fold, but with a vreg0 register class not containing %WZR. 
+name: test_subreg_spill_fold2 +registers: + - { id: 0, class: gpr64sp } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.sub_32 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + %x0 = ADDXri %0, 1, 0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold3 +# Similar to test_subreg_spill_fold, but with a cross register class copy. 
+name: test_subreg_spill_fold3 +registers: + - { id: 0, class: fpr64 } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.ssub = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %d0, 12, implicit-def dead %d1, 12, implicit-def dead %d2, 12, implicit-def dead %d3, 12, implicit-def dead %d4, 12, implicit-def dead %d5, 12, implicit-def dead %d6, 12, implicit-def dead %d7, 12, implicit-def dead %d8, 12, implicit-def dead %d9, 12, implicit-def dead %d10, 12, implicit-def dead %d11, 12, implicit-def dead %d12, 12, implicit-def dead %d13, 12, implicit-def dead %d14, 12, implicit-def dead %d15, 12, implicit-def dead %d16, 12, implicit-def dead %d17, 12, implicit-def dead %d18, 12, implicit-def dead %d19, 12, implicit-def dead %d20, 12, implicit-def dead %d21, 12, implicit-def dead %d22, 12, implicit-def dead %d23, 12, implicit-def dead %d24, 12, implicit-def dead %d25, 12, implicit-def dead %d26, 12, implicit-def dead %d27, 12, implicit-def dead %d28, 12, implicit-def dead %d29, 12, implicit-def dead %d30, 12, implicit-def %d31 + %x0 = COPY %0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_fill_fold +# Ensure that the filled COPY is eliminated and folded into the fill load. 
+name: test_subreg_fill_fold +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr64 } +body: | + bb.0: + %0 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + ; CHECK: undef %1.sub_32 = LDRWui %stack.0, 0 :: (load 4 from %stack.0) + undef %1.sub_32 = COPY %0 + %x0 = COPY %1 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_fill_fold2 +# Similar to test_subreg_fill_fold, but with a cross-class copy. 
+name: test_subreg_fill_fold2 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: fpr64 } +body: | + bb.0: + %0 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + ; CHECK: undef %1.ssub = LDRSui %stack.0, 0 :: (load 4 from %stack.0) + undef %1.ssub = COPY %0 + %d0 = COPY %1 + RET_ReallyLR implicit %d0 +...