Index: lib/CodeGen/MachineScheduler.cpp
===================================================================
--- lib/CodeGen/MachineScheduler.cpp
+++ lib/CodeGen/MachineScheduler.cpp
@@ -2888,6 +2888,80 @@
   return 0;
 }
 
+// Check if MI (which is expected to be a COPY) has as one of its operands a
+// physical register that could be allocated to the other operand's virtual
+// register.
+static bool isCoalescablePRegCopy(MachineInstr *CopyMI,
+                                  const TargetRegisterInfo &TRI,
+                                  const MachineRegisterInfo &MRI) {
+  assert(CopyMI->isCopy() && "Expected a COPY");
+
+  MachineOperand *PRegMO = nullptr, *VRegMO = nullptr;
+  if (TargetRegisterInfo::isPhysicalRegister(CopyMI->getOperand(0).getReg())) {
+    PRegMO = &CopyMI->getOperand(0);
+    VRegMO = &CopyMI->getOperand(1);
+  } else {
+    PRegMO = &CopyMI->getOperand(1);
+    VRegMO = &CopyMI->getOperand(0);
+  }
+  if (!TargetRegisterInfo::isPhysicalRegister(PRegMO->getReg()) ||
+      !TargetRegisterInfo::isVirtualRegister(VRegMO->getReg()))
+    return false;
+
+  MCPhysReg PhysReg = PRegMO->getReg();
+  unsigned VirtReg = VRegMO->getReg();
+  unsigned VSub = VRegMO->getSubReg();
+
+  const TargetRegisterClass *VirtRC = MRI.getRegClass(VirtReg);
+  return (!MRI.isReserved(PhysReg) &&
+          (VirtRC->contains(PhysReg) ||
+           (VSub && TRI.getMatchingSuperReg(PhysReg, VSub, VirtRC))));
+}
+
+/// Minimize the (virtual) live ranges of copies involving phys-regs. In
+/// regions with both incoming and outgoing arguments, this reduces the risk
+/// of overlapping live ranges that would hinder coalescing. In contrast to
+/// biasPhysRegCopy(), this does not handle the COPYs themselves, but rather
+/// the instructions connected to a COPY involving a phys-reg.
+static int copiedPhysRegsUses(const SUnit *SU, bool isTop) {
+  const MachineInstr *MI = SU->getInstr();
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const MachineRegisterInfo *MRI = &MF->getRegInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  int NumCopiedPhysRegUses = 0;
+  // Putting OP closer to the COPY minimizes the chance of %0 interfering
+  // with %physregX:
+  //   %0 = COPY %physregX
+  //   ...
+  //   %1 = OP %0
+  for (const MachineOperand &MO : MI->uses()) {
+    if (!MO.isReg())
+      continue;
+    MachineInstr *DefMI = MRI->getUniqueVRegDef(MO.getReg());
+    if (DefMI != nullptr && DefMI->isCopy() &&
+        DefMI->getParent() == MI->getParent() &&
+        isCoalescablePRegCopy(DefMI, *TRI, *MRI))
+      NumCopiedPhysRegUses++;
+  }
+
+  // Inverse case:
+  //   %1 = OP %0
+  //   ...
+  //   %physregX = COPY %1
+  if (MI->getNumOperands()) {
+    const MachineOperand &DefMO = MI->getOperand(0);
+    unsigned DefReg = ((DefMO.isReg() && DefMO.isDef()) ? DefMO.getReg() : 0);
+    if (DefReg && TRI->isVirtualRegister(DefReg) && MRI->hasOneUse(DefReg)) {
+      MachineInstr *UseMI = &*MRI->use_instr_begin(DefReg);
+      if (UseMI->isCopy() && UseMI->getParent() == MI->getParent() &&
+          isCoalescablePRegCopy(UseMI, *TRI, *MRI))
+        NumCopiedPhysRegUses--;
+    }
+  }
+
+  return isTop ? NumCopiedPhysRegUses : -NumCopiedPhysRegUses;
+}
+
 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                      bool AtTop,
                                      const RegPressureTracker &RPTracker,
@@ -3030,6 +3104,12 @@
       !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
     return;
 
+  // Try to minimize live ranges of copied physregs.
+  if (tryGreater(copiedPhysRegsUses(TryCand.SU, TryCand.AtTop),
+                 copiedPhysRegsUses(Cand.SU, Cand.AtTop),
+                 TryCand, Cand, PhysRegCopy))
+    return;
+
   // Fall through to original instruction order.
   if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
       (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
Index: test/CodeGen/SystemZ/args-11.mir
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/args-11.mir
@@ -0,0 +1,102 @@
+# REQUIRES: asserts
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -run-pass=machine-scheduler \
+# RUN:   -debug-only=machine-scheduler 2>&1 | FileCheck %s
+
+# Test that an extra COPY due to poor pre-RA scheduling is avoided. The
+# extra COPY would result if '%7 = ADJDYNALLOC ...' were scheduled above
+# the three ADJDYNALLOCs that use %1 (the %r3d dependency).
+
+--- |
+
+  declare i64 @bar(i8*, i8*, i8*, i8*, i8*, i64, i64)
+
+  define i64 @f1(i64 %length, i64 %index) {
+    %a = alloca i8, i64 %length
+    %b = getelementptr i8, i8* %a, i64 1
+    %cindex = add i64 %index, 3919
+    %c = getelementptr i8, i8* %a, i64 %cindex
+    %dindex = add i64 %index, 3920
+    %d = getelementptr i8, i8* %a, i64 %dindex
+    %eindex = add i64 %index, 4095
+    %e = getelementptr i8, i8* %a, i64 %eindex
+    %count = call i64 @bar(i8* %a, i8* %b, i8* %c, i8* %d, i8* %e, i64 0, i64 0)
+    %res = add i64 %count, 1
+    ret i64 %res
+  }
+
+...
+
+# CHECK: ********** MI Scheduling **********
+# CHECK: f1:BB#0
+# CHECK: From: %vreg7 = ADJDYNALLOC %vreg5, 1, %noreg; GR64Bit:%vreg7 ADDR64Bit:%vreg5
+# CHECK: To: CallBRASL
+# CHECK: *** Final schedule for BB#0 ***
+# CHECK: SU(1): %vreg8 = ADJDYNALLOC %vreg5, 3919, %vreg1; GR64Bit:%vreg8 ADDR64Bit:%vreg5,%vreg1
+# CHECK: SU(2): %vreg9 = ADJDYNALLOC %vreg5, 3920, %vreg1; GR64Bit:%vreg9 ADDR64Bit:%vreg5,%vreg1
+# CHECK: SU(3): %vreg10 = ADJDYNALLOC %vreg5, 4095, %vreg1; GR64Bit:%vreg10 ADDR64Bit:%vreg5,%vreg1
+# CHECK: SU(0): %vreg7 = ADJDYNALLOC %vreg5, 1, %noreg; GR64Bit:%vreg7 ADDR64Bit:%vreg5
+#
+# CHECK: ********** MI Scheduling **********
+# CHECK: f1:BB#0
+# CHECK: From: %vreg1 = COPY %R3D; ADDR64Bit:%vreg1
+# CHECK: To: %R15D = COPY %vreg5; ADDR64Bit:%vreg5
+
+---
+name: f1
+alignment: 2
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: addr64bit }
+  - { id: 1, class: addr64bit }
+  - { id: 2, class: gr64bit }
+  - { id: 3, class: gr64bit }
+  - { id: 4, class: gr64bit }
+  - { id: 5, class: addr64bit }
+  - { id: 6, class: gr64bit }
+  - { id: 7, class: gr64bit }
+  - { id: 8, class: gr64bit }
+  - { id: 9, class: gr64bit }
+  - { id: 10, class: gr64bit }
+  - { id: 11, class: addr64bit }
+  - { id: 12, class: addr64bit }
+  - { id: 13, class: gr64bit }
+liveins:
+  - { reg: '%r2d', virtual-reg: '%0' }
+  - { reg: '%r3d', virtual-reg: '%1' }
+frameInfo:
+  maxAlignment: 8
+  hasCalls: true
+stack:
+  - { id: 0, name: a, type: variable-sized, alignment: 1, stack-id: 0 }
+body: |
+  bb.0 (%ir-block.0):
+    liveins: %r2d, %r3d
+
+    %1 = COPY %r3d
+    %0 = COPY %r2d
+    %3 = LA %0, 7, _
+    %3 = NILL64 %3, 65528, implicit-def dead %cc
+    %5 = COPY %r15d
+    %5 = SGR %5, %3, implicit-def dead %cc
+    %6 = ADJDYNALLOC %5, 0, _
+    %r15d = COPY %5
+    %7 = ADJDYNALLOC %5, 1, _
+    %8 = ADJDYNALLOC %5, 3919, %1
+    %9 = ADJDYNALLOC %5, 3920, %1
+    %10 = ADJDYNALLOC %5, 4095, %1
+    ADJCALLSTACKDOWN 16, 0
+    MVGHI %r15d, 168, 0 :: (store 8)
+    MVGHI %r15d, 160, 0 :: (store 8)
+    %r2d = COPY %6
+    %r3d = COPY %7
+    %r4d = COPY %8
+    %r5d = COPY %9
+    %r6d = COPY %10
+    CallBRASL @bar, %r2d, %r3d, killed %r4d, killed %r5d, killed %r6d, csr_systemz, implicit-def dead %r14d, implicit-def dead %cc, implicit-def %r2d
+    ADJCALLSTACKUP 16, 0
+    %12 = COPY %r2d
+    %13 = LA %12, 1, _
+    %r2d = COPY %13
+    Return implicit %r2d
+
+...
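
The sign convention in copiedPhysRegsUses() is worth spelling out: the raw count is positive for an instruction that consumes coalescable phys-reg COPYs, negative for an instruction whose only use feeds such a COPY, and it is negated for the bottom zone so that a single greater-than comparison pulls both kinds of instructions toward their COPYs in either scheduling direction. Below is a minimal, self-contained sketch of that convention, not part of the patch: pickGreater() and bias() are hypothetical stand-ins for GenericSchedulerBase::tryGreater() and the return statement of copiedPhysRegsUses(), and the counts are assumed values.

#include <cassert>

// Hypothetical stand-in for GenericSchedulerBase::tryGreater(): prefer the
// candidate with the larger score; report a tie by returning false so that
// later heuristics get to decide.
static bool pickGreater(int TryVal, int CandVal, bool &TryWins) {
  if (TryVal == CandVal)
    return false;
  TryWins = TryVal > CandVal;
  return true;
}

// The biasing done on return from copiedPhysRegsUses().
static int bias(int NumCopiedPhysRegUses, bool IsTop) {
  return IsTop ? NumCopiedPhysRegUses : -NumCopiedPhysRegUses;
}

int main() {
  bool TryWins = false;

  // Top-down zone: an instruction with two operands defined by coalescable
  // phys-reg COPYs (count +2) beats a neutral candidate (count 0), so it is
  // scheduled early, next to the incoming COPYs.
  assert(pickGreater(bias(2, true), bias(0, true), TryWins) && TryWins);

  // Top-down zone: an instruction whose only use is a COPY into a phys-reg
  // (count -1) loses to a neutral candidate and sinks toward that COPY.
  assert(pickGreater(bias(-1, true), bias(0, true), TryWins) && !TryWins);

  // Bottom-up zone: the negation makes the same COPY-feeding instruction
  // win the late slot instead, again keeping it next to the outgoing COPY.
  assert(pickGreater(bias(-1, false), bias(0, false), TryWins) && TryWins);

  return 0;
}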