Index: include/llvm/CodeGen/MachineScheduler.h =================================================================== --- include/llvm/CodeGen/MachineScheduler.h +++ include/llvm/CodeGen/MachineScheduler.h @@ -791,7 +791,7 @@ enum CandReason : uint8_t { NoCand, Only1, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, - TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; + TopDepthReduce, TopPathReduce, NextDefUse, PhysRegCp2, NodeOrder}; #ifndef NDEBUG static const char *getReasonStr(GenericSchedulerBase::CandReason Reason); Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -2548,6 +2548,7 @@ case BotHeightReduce:return "BOT-HEIGHT"; case BotPathReduce: return "BOT-PATH "; case NextDefUse: return "DEF-USE "; + case PhysRegCp2: return "PREG-CP-2 "; case NodeOrder: return "ORDER "; }; llvm_unreachable("Unknown reason!"); @@ -2888,6 +2889,152 @@ return 0; } +// Check if CopyMI (which is expected to be a COPY) has as one of its operands a +// physical register that could be allocated to the other operand's virtual +// register.
+static bool isCoalescablePRegCopy(MachineInstr &CopyMI,
+                                  const TargetRegisterInfo &TRI,
+                                  const MachineRegisterInfo &MRI) {
+  assert(CopyMI.isCopy() && "Expected a COPY");
+
+  // Operand 0 is the COPY's def, operand 1 its source. Work out which side
+  // (if any) is the physical register; the other one must be virtual.
+  const bool DefIsPhys =
+      TargetRegisterInfo::isPhysicalRegister(CopyMI.getOperand(0).getReg());
+  const MachineOperand &PRegMO = CopyMI.getOperand(DefIsPhys ? 0 : 1);
+  const MachineOperand &VRegMO = CopyMI.getOperand(DefIsPhys ? 1 : 0);
+  if (!TargetRegisterInfo::isPhysicalRegister(PRegMO.getReg()) ||
+      !TargetRegisterInfo::isVirtualRegister(VRegMO.getReg()))
+    return false;
+
+  const MCPhysReg PhysReg = PRegMO.getReg();
+  const unsigned VirtReg = VRegMO.getReg();
+  const unsigned VSub = VRegMO.getSubReg();
+
+  // PhysReg must be unreserved and either be in VirtReg's class or, for a
+  // sub-register copy, have a matching super-register in that class.
+  const TargetRegisterClass *VirtRC = MRI.getRegClass(VirtReg);
+  return (!MRI.isReserved(PhysReg) &&
+          (VirtRC->contains(PhysReg) ||
+           (VSub && TRI.getMatchingSuperReg(PhysReg, VSub, VirtRC))));
+}
+
+/// Values collected by findConnectedPhysRegs() for an SU.
+struct CandPRegs {
+  /// Source preg of a coalescable same-block COPY feeding the SU (0 if none).
+  unsigned UsedPReg = 0;
+  /// Dest preg of a coalescable same-block COPY of the SU's result (0 if none).
+  unsigned DefedPReg = 0;
+  /// Number of single-use copied uses minus copied defs, negated for the
+  /// bottom boundary.
+  int NumCopied = 0;
+};
+
+/// Find the copy-connected physregs of SU and their heuristic sum (for a top
+/// region: incremented for a use, decremented for a def; negated for bottom).
+static void findConnectedPhysRegs(const SUnit &SU, bool isTop,
+                                  CandPRegs &PRegs) {
+  const MachineInstr *MI = SU.getInstr();
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const MachineRegisterInfo *MRI = &MF->getRegInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  int NumCopiedPhysRegUses = 0;
+  // Putting OP closer to the COPY minimizes the chance of %0 interfering
+  // with physregX:
+  //   %0 = COPY %physregX
+  //   ...
+  //   %1 = OP %0
+  for (const MachineOperand &MO : MI->uses()) {
+    // Only a vreg operand can be fed by such a COPY; skip pregs and %noreg.
+    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+    MachineInstr *DefMI = MRI->getUniqueVRegDef(MO.getReg());
+    if (DefMI && DefMI->isCopy() && DefMI->getParent() == MI->getParent() &&
+        isCoalescablePRegCopy(*DefMI, *TRI, *MRI)) {
+      if (MRI->hasOneUse(MO.getReg()))
+        // Make this a bit less aggressive by checking for one use.
+        NumCopiedPhysRegUses++;
+      PRegs.UsedPReg = DefMI->getOperand(1).getReg();
+    }
+  }
+
+  // Inverse case:
+  //   %1 = OP %0
+  //   ...
+  //   %physregX = COPY %1
+  if (MI->getNumOperands()) {
+    // Don't move a copy of a subreg of a 128bit register away from the wide
+    // definition, or regalloc might run out of registers.
+    const bool SkipWideCopySubReg =
+        MI->isCopy() && MI->getOperand(1).getSubReg() &&
+        TRI->getRegSizeInBits(
+            *MRI->getRegClass(MI->getOperand(1).getReg())) > 64;
+    const MachineOperand &DefMO = MI->getOperand(0);
+    unsigned DefReg = ((DefMO.isReg() && DefMO.isDef()) ? DefMO.getReg() : 0);
+    if (DefReg && TRI->isVirtualRegister(DefReg) && MRI->hasOneUse(DefReg) &&
+        !SkipWideCopySubReg) {
+      MachineInstr *UseMI = &*MRI->use_instr_begin(DefReg);
+      if (UseMI->isCopy() && UseMI->getParent() == MI->getParent() &&
+          isCoalescablePRegCopy(*UseMI, *TRI, *MRI)) {
+        NumCopiedPhysRegUses--;
+        PRegs.DefedPReg = UseMI->getOperand(0).getReg();
+      }
+    }
+  }
+
+  PRegs.NumCopied = isTop ? NumCopiedPhysRegUses : -NumCopiedPhysRegUses;
+}
+
+/// Minimize the (virtual) live ranges of copies involving phys-regs. In
+/// regions with both incoming and outgoing arguments, this will reduce the
+/// risk of overlapping live ranges that will hinder coalescing. In contrast
+/// to biasPhysRegCopy(), this does not typically handle COPYs, but rather
+/// instructions connected to a COPY involving a phys-reg.
+static bool tryPhysRegCopies2(GenericSchedulerBase::SchedCandidate &TryCand,
+                              GenericSchedulerBase::SchedCandidate &Cand) {
+  assert(TryCand.AtTop == Cand.AtTop &&
+         "Expected two candidates from same boundary.");
+
+  const GenericSchedulerBase::CandReason Why = GenericSchedulerBase::PhysRegCp2;
+
+  // Collect the copy-connected physregs of both candidates.
+  CandPRegs TryRegs, CurRegs;
+  findConnectedPhysRegs(*TryCand.SU, TryCand.AtTop, TryRegs);
+  findConnectedPhysRegs(*Cand.SU, Cand.AtTop, CurRegs);
+
+  // First try to handle the case where the two candidates have overlapping
+  // live-ranges connected to the same physreg. %1 and %2 should be
+  // reordered:
+  //   %0 = COPY %physregX
+  //   %1 = OP
+  //   %2 = OP (use %0)
+  //   %physregX = %1
+  // "Near" is the preg a candidate uses (top boundary) or defs (bottom
+  // boundary); "Far" is the other one of the pair. The two boundaries are
+  // mirror images, so select the fields once and compare them uniformly.
+  const bool Top = TryCand.AtTop;
+  const unsigned TryNear = Top ? TryRegs.UsedPReg : TryRegs.DefedPReg;
+  const unsigned TryFar = Top ? TryRegs.DefedPReg : TryRegs.UsedPReg;
+  const unsigned CurNear = Top ? CurRegs.UsedPReg : CurRegs.DefedPReg;
+  const unsigned CurFar = Top ? CurRegs.DefedPReg : CurRegs.UsedPReg;
+
+  // TryCand's near preg is Cand's far preg: pick TryCand.
+  if (TryNear != 0 && TryNear == CurFar) {
+    TryCand.Reason = Why;
+    return true;
+  }
+  // The reverse: keep Cand, recording PhysRegCp2 if its reason ranks later.
+  if (CurNear != 0 && CurNear == TryFar) {
+    if (Cand.Reason > Why)
+      Cand.Reason = Why;
+    return true;
+  }
+
+  // Prefer the candidate that has the most(top) / least(bot) number of copied
+  // phys-regs, to minimize those live-ranges.
+  return tryGreater(TryRegs.NumCopied, CurRegs.NumCopied, TryCand, Cand, Why);
+}
+
 void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                      bool AtTop,
                                      const RegPressureTracker &RPTracker,
@@ -3030,6 +3177,10 @@
         !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
       return;
 
+    // Try to minimize live ranges of copied physregs.
+ if (tryPhysRegCopies2(TryCand, Cand)) + return; + // Fall through to original instruction order. if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { Index: test/CodeGen/SystemZ/alloca-01.ll =================================================================== --- test/CodeGen/SystemZ/alloca-01.ll +++ test/CodeGen/SystemZ/alloca-01.ll @@ -13,10 +13,6 @@ ; Allocate %length bytes and take addresses based on the result. ; There are two stack arguments, so an offset of 160 + 2 * 8 == 176 ; is added to the copy of %r15. -; -; NOTE: 'la %r0, 177(%r1)' is actually an expected fail as it would -; be better (and possible) to load into %r3 directly. -; define i64 @f1(i64 %length, i64 %index) { ; FIXME: a better sequence would be: ; @@ -38,7 +34,7 @@ ; ; CHECK-B-LABEL: f1: ; CHECK-B: lgr %r15, %r1 -; CHECK-B: la %r0, 177(%r1) +; CHECK-B: la %r3, 177(%r1) ; ; CHECK-C-LABEL: f1: ; CHECK-C: lgr %r15, %r1 Index: test/CodeGen/SystemZ/args-06.ll =================================================================== --- test/CodeGen/SystemZ/args-06.ll +++ test/CodeGen/SystemZ/args-06.ll @@ -5,10 +5,10 @@ define i8 @f1(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g) { ; CHECK-LABEL: f1: -; CHECK: lb {{%r[0-5]}}, 175(%r15) -; CHECK: lb {{%r[0-5]}}, 167(%r15) -; CHECK: ar %r2, %r3 -; CHECK: ar %r2, %r4 +; CHECK-DAG: lb {{%r[0-5]}}, 175(%r15) +; CHECK-DAG: lb {{%r[0-5]}}, 167(%r15) +; CHECK-DAG: ar %r2, %r3 +; CHECK-DAG: ar %r2, %r4 ; CHECK: ar %r2, %r5 ; CHECK: ar %r2, %r6 ; CHECK: br %r14 Index: test/CodeGen/SystemZ/args-11.mir =================================================================== --- /dev/null +++ test/CodeGen/SystemZ/args-11.mir @@ -0,0 +1,101 @@ +# RUN: llc -o - %s -mtriple=s390x-linux-gnu -run-pass=machine-scheduler \ +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s + +# Test that an extra COPY due to poor pre-RA scheduling is avoided. This +# would happen if the '%7 = ADJDYNALLOC ...' 
is scheduled above the three +# ADJDYNALLOCs using %1 (%r3d dependency). + +--- | + + declare i64 @bar(i8*, i8*, i8*, i8*, i8*, i64, i64) + + define i64 @f1(i64 %length, i64 %index) { + %a = alloca i8, i64 %length + %b = getelementptr i8, i8* %a, i64 1 + %cindex = add i64 %index, 3919 + %c = getelementptr i8, i8* %a, i64 %cindex + %dindex = add i64 %index, 3920 + %d = getelementptr i8, i8* %a, i64 %dindex + %eindex = add i64 %index, 4095 + %e = getelementptr i8, i8* %a, i64 %eindex + %count = call i64 @bar(i8* %a, i8* %b, i8* %c, i8* %d, i8* %e, i64 0, i64 0) + %res = add i64 %count, 1 + ret i64 %res + } + +... + +# CHECK: ********** MI Scheduling ********** +# CHECK: f1:BB#0 +# CHECK: From: %vreg7 = ADJDYNALLOC %vreg5, 1, %noreg; GR64Bit:%vreg7 ADDR64Bit:%vreg5 +# CHECK: To: CallBRASL +# CHECK: *** Final schedule for BB#0 *** +# CHECK: SU(1): %vreg8 = ADJDYNALLOC %vreg5, 3919, %vreg1; GR64Bit:%vreg8 ADDR64Bit:%vreg5,%vreg1 +# CHECK: SU(2): %vreg9 = ADJDYNALLOC %vreg5, 3920, %vreg1; GR64Bit:%vreg9 ADDR64Bit:%vreg5,%vreg1 +# CHECK: SU(3): %vreg10 = ADJDYNALLOC %vreg5, 4095, %vreg1; GR64Bit:%vreg10 ADDR64Bit:%vreg5,%vreg1 +# CHECK: SU(0): %vreg7 = ADJDYNALLOC %vreg5, 1, %noreg; GR64Bit:%vreg7 ADDR64Bit:%vreg5 +# +# CHECK: ********** MI Scheduling ********** +# CHECK: f1:BB#0 +# CHECK: From: %vreg1 = COPY %R3D; ADDR64Bit:%vreg1 +# CHECK: To: %R15D = COPY %vreg5; ADDR64Bit:%vreg5 + +--- +name: f1 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: addr64bit } + - { id: 1, class: addr64bit } + - { id: 2, class: gr64bit } + - { id: 3, class: gr64bit } + - { id: 4, class: gr64bit } + - { id: 5, class: addr64bit } + - { id: 6, class: gr64bit } + - { id: 7, class: gr64bit } + - { id: 8, class: gr64bit } + - { id: 9, class: gr64bit } + - { id: 10, class: gr64bit } + - { id: 11, class: addr64bit } + - { id: 12, class: addr64bit } + - { id: 13, class: gr64bit } +liveins: + - { reg: '%r2d', virtual-reg: '%0' } + - { reg: '%r3d', virtual-reg: '%1' } 
+frameInfo: + maxAlignment: 8 + hasCalls: true +stack: + - { id: 0, name: a, type: variable-sized, alignment: 1, stack-id: 0 } +body: | + bb.0 (%ir-block.0): + liveins: %r2d, %r3d + + %1 = COPY %r3d + %0 = COPY %r2d + %3 = LA %0, 7, _ + %3 = NILL64 %3, 65528, implicit-def dead %cc + %5 = COPY %r15d + %5 = SGR %5, %3, implicit-def dead %cc + %6 = ADJDYNALLOC %5, 0, _ + %r15d = COPY %5 + %7 = ADJDYNALLOC %5, 1, _ + %8 = ADJDYNALLOC %5, 3919, %1 + %9 = ADJDYNALLOC %5, 3920, %1 + %10 = ADJDYNALLOC %5, 4095, %1 + ADJCALLSTACKDOWN 16, 0 + MVGHI %r15d, 168, 0 :: (store 8) + MVGHI %r15d, 160, 0 :: (store 8) + %r2d = COPY %6 + %r3d = COPY %7 + %r4d = COPY %8 + %r5d = COPY %9 + %r6d = COPY %10 + CallBRASL @bar, %r2d, %r3d, killed %r4d, killed %r5d, killed %r6d, csr_systemz, implicit-def dead %r14d, implicit-def dead %cc, implicit-def %r2d + ADJCALLSTACKUP 16, 0 + %12 = COPY %r2d + %13 = LA %12, 1, _ + %r2d = COPY %13 + Return implicit %r2d + +... Index: test/CodeGen/SystemZ/cond-move-02.ll =================================================================== --- test/CodeGen/SystemZ/cond-move-02.ll +++ test/CodeGen/SystemZ/cond-move-02.ll @@ -4,8 +4,8 @@ define i32 @f1(i32 %x) { ; CHECK-LABEL: f1: -; CHECK: lhi [[REG:%r[0-5]]], 0 -; CHECK: chi %r2, 0 +; CHECK-DAG: lhi [[REG:%r[0-5]]], 0 +; CHECK-DAG: chi %r2, 0 ; CHECK: lochilh [[REG]], 42 ; CHECK: br %r14 %cond = icmp ne i32 %x, 0 @@ -35,8 +35,8 @@ define i64 @f4(i64 %x) { ; CHECK-LABEL: f4: -; CHECK: lghi [[REG:%r[0-5]]], 0 -; CHECK: cghi %r2, 0 +; CHECK-DAG: lghi [[REG:%r[0-5]]], 0 +; CHECK-DAG: cghi %r2, 0 ; CHECK: locghilh [[REG]], 42 ; CHECK: br %r14 %cond = icmp ne i64 %x, 0 Index: test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll =================================================================== --- test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll +++ test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll @@ -64,7 +64,7 @@ ; CHECK-DAG: vceqh [[REG4:%v[0-9]+]], %v30, %v27 ; CHECK-DAG: vl [[REG5:%v[0-9]+]], 176(%r15) ; 
CHECK-DAG: vl [[REG6:%v[0-9]+]], 160(%r15) -; CHECK-DAG: vo [[REG7:%v[0-9]+]], %v2, [[REG4]] +; CHECK-DAG: vo [[REG7:%v[0-9]+]], [[REG1]], [[REG4]] ; CHECK-DAG: vo [[REG8:%v[0-9]+]], [[REG2]], [[REG3]] ; CHECK-DAG: vsel %v24, %v29, [[REG6]], [[REG8]] ; CHECK-DAG: vsel %v26, %v31, [[REG5]], [[REG7]] @@ -185,7 +185,7 @@ ; CHECK-DAG: vceqh [[REG0:%v[0-9]+]], %v24, %v26 ; CHECK-DAG: vceqh [[REG1:%v[0-9]+]], %v28, %v30 ; CHECK-NEXT: vx [[REG2:%v[0-9]+]], [[REG0]], [[REG1]] -; CHECK-DAG: vuphh [[REG3:%v[0-9]+]], [[REG2]] +; CHECK-NEXT: vuphh [[REG3:%v[0-9]+]], [[REG2]] ; CHECK-DAG: vmrlg [[REG4:%v[0-9]+]], [[REG2]], [[REG2]] ; CHECK-DAG: vuphh [[REG5:%v[0-9]+]], [[REG4]] ; CHECK-NEXT: vsel %v24, %v25, %v29, [[REG3]] @@ -346,7 +346,7 @@ ; CHECK-NEXT: vceqf %v0, %v24, %v26 ; CHECK-NEXT: vuphh %v1, %v1 ; CHECK-NEXT: vn %v0, %v0, %v1 -; CHECK-DAG: vuphf [[REG0:%v[0-9]+]], %v0 +; CHECK-NEXT: vuphf [[REG0:%v[0-9]+]], %v0 ; CHECK-DAG: vmrlg [[REG1:%v[0-9]+]], %v0, %v0 ; CHECK-DAG: vuphf [[REG2:%v[0-9]+]], [[REG1]] ; CHECK-NEXT: vsel %v24, %v25, %v29, [[REG0]] @@ -439,9 +439,9 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vceqg %v0, %v26, %v30 ; CHECK-NEXT: vceqg %v1, %v24, %v28 -; CHECK-NEXT: vpkg %v0, %v1, %v0 -; CHECK-NEXT: vceqf %v1, %v25, %v27 -; CHECK-NEXT: vx %v0, %v0, %v1 +; CHECK-DAG: vpkg %v0, %v1, %v0 +; CHECK-DAG: vceqf [[REG0:%v[0-9]+]], %v25, %v27 +; CHECK-NEXT: vx %v0, %v0, [[REG0]] ; CHECK-NEXT: vsel %v24, %v29, %v31, %v0 ; CHECK-NEXT: br %r14 %cmp0 = icmp eq <4 x i64> %val1, %val2 @@ -477,20 +477,20 @@ define <2 x float> @fun25(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4, <2 x float> %val5, <2 x float> %val6) { ; CHECK-LABEL: fun25: ; CHECK: # BB#0: -; CHECK-NEXT: vmrlf %v0, %v26, %v26 -; CHECK-NEXT: vmrlf %v1, %v24, %v24 -; CHECK-NEXT: vldeb %v0, %v0 -; CHECK-NEXT: vldeb %v1, %v1 -; CHECK-NEXT: vfchdb %v0, %v1, %v0 -; CHECK-NEXT: vmrhf %v1, %v26, %v26 -; CHECK-NEXT: vmrhf %v2, %v24, %v24 -; CHECK-NEXT: vldeb %v1, %v1 -; CHECK-NEXT: vldeb 
%v2, %v2 -; CHECK-NEXT: vfchdb %v1, %v2, %v1 -; CHECK-NEXT: vpkg %v0, %v1, %v0 -; CHECK-NEXT: vfchdb %v1, %v28, %v30 -; CHECK-NEXT: vpkg %v1, %v1, %v1 -; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vmrlf [[REG0:%v[0-9]+]], %v26, %v26 +; CHECK-NEXT: vmrlf [[REG1:%v[0-9]+]], %v24, %v24 +; CHECK-NEXT: vldeb [[REG0]], [[REG0]] +; CHECK-NEXT: vldeb [[REG1]], [[REG1]] +; CHECK-NEXT: vfchdb [[REG5:%v[0-9]+]], [[REG1]], [[REG0]] +; CHECK-NEXT: vmrhf [[REG2:%v[0-9]+]], %v26, %v26 +; CHECK-NEXT: vmrhf [[REG3:%v[0-9]+]], %v24, %v24 +; CHECK-NEXT: vldeb [[REG2]], [[REG2]] +; CHECK-DAG: vldeb [[REG3]], [[REG3]] +; CHECK-DAG: vfchdb [[REG4:%v[0-9]+]], [[REG3]], [[REG2]] +; CHECK-DAG: vpkg [[REG7:%v[0-9]+]], [[REG4]], [[REG5]] +; CHECK-DAG: vfchdb [[REG6:%v[0-9]+]], %v28, %v30 +; CHECK-DAG: vpkg [[REG6]], [[REG6]], [[REG6]] +; CHECK-NEXT: vo %v0, [[REG7]], [[REG6]] ; CHECK-NEXT: vsel %v24, %v25, %v27, %v0 ; CHECK-NEXT: br %r14 ; @@ -512,28 +512,28 @@ define <2 x double> @fun26(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4, <2 x double> %val5, <2 x double> %val6) { ; CHECK-LABEL: fun26: ; CHECK: # BB#0: -; CHECK-NEXT: vmrlf %v0, %v26, %v26 -; CHECK-NEXT: vmrlf %v1, %v24, %v24 -; CHECK-NEXT: vldeb %v0, %v0 -; CHECK-NEXT: vldeb %v1, %v1 -; CHECK-NEXT: vfchdb %v0, %v1, %v0 -; CHECK-NEXT: vmrhf %v1, %v26, %v26 -; CHECK-NEXT: vmrhf %v2, %v24, %v24 -; CHECK-NEXT: vldeb %v1, %v1 -; CHECK-NEXT: vldeb %v2, %v2 -; CHECK-NEXT: vfchdb %v1, %v2, %v1 -; CHECK-NEXT: vpkg %v0, %v1, %v0 -; CHECK-NEXT: vuphf %v0, %v0 -; CHECK-NEXT: vfchdb %v1, %v28, %v30 -; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vmrlf [[REG0:%v[0-9]+]], %v26, %v26 +; CHECK-NEXT: vmrlf [[REG1:%v[0-9]+]], %v24, %v24 +; CHECK-NEXT: vldeb [[REG0]], [[REG0]] +; CHECK-NEXT: vldeb [[REG1]], [[REG1]] +; CHECK-NEXT: vfchdb [[REG2:%v[0-9]+]], [[REG1]], [[REG0]] +; CHECK-NEXT: vmrhf [[REG3:%v[0-9]+]], %v26, %v26 +; CHECK-NEXT: vmrhf [[REG4:%v[0-9]+]], %v24, %v24 +; CHECK-NEXT: vldeb [[REG3]], [[REG3]] +; 
CHECK-DAG: vldeb [[REG4]], [[REG4]] +; CHECK-DAG: vfchdb [[REG5:%v[0-9]+]], [[REG4]], [[REG3]] +; CHECK-DAG: vpkg [[REG6:%v[0-9]+]], [[REG5]], [[REG2]] +; CHECK-DAG: vuphf [[REG6]], [[REG6]] +; CHECK-DAG: vfchdb [[REG7:%v[0-9]+]], %v28, %v30 +; CHECK-NEXT: vo %v0, [[REG6]], [[REG7]] ; CHECK-NEXT: vsel %v24, %v25, %v27, %v0 ; CHECK-NEXT: br %r14 ; ; CHECK-Z14-LABEL: fun26: ; CHECK-Z14: # BB#0: ; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26 -; CHECK-Z14-NEXT: vuphf %v0, %v0 -; CHECK-Z14-NEXT: vfchdb %v1, %v28, %v30 +; CHECK-Z14-DAG: vuphf %v0, %v0 +; CHECK-Z14-DAG: vfchdb %v1, %v28, %v30 ; CHECK-Z14-NEXT: vo %v0, %v0, %v1 ; CHECK-Z14-NEXT: vsel %v24, %v25, %v27, %v0 ; CHECK-Z14-NEXT: br %r14 @@ -630,12 +630,12 @@ ; CHECK-NEXT: vldeb %v3, %v3 ; CHECK-NEXT: vfchdb %v2, %v3, %v2 ; CHECK-NEXT: vpkg %v1, %v2, %v1 -; CHECK-NEXT: vx %v0, %v0, %v1 -; CHECK-NEXT: vmrlg %v1, %v0, %v0 -; CHECK-NEXT: vuphf %v1, %v1 -; CHECK-NEXT: vuphf %v0, %v0 -; CHECK-NEXT: vsel %v24, %v25, %v29, %v0 -; CHECK-NEXT: vsel %v26, %v27, %v31, %v1 +; CHECK-NEXT: vx [[REG1:%v[0-9]+]], %v0, %v1 +; CHECK-DAG: vmrlg [[REG0:%v[0-9]+]], [[REG1]], [[REG1]] +; CHECK-DAG: vuphf [[REG2:%v[0-9]+]], [[REG1]] +; CHECK-DAG: vuphf [[REG0]], [[REG0]] +; CHECK-NEXT: vsel %v24, %v25, %v29, [[REG2]] +; CHECK-NEXT: vsel %v26, %v27, %v31, [[REG0]] ; CHECK-NEXT: br %r14 ; ; CHECK-Z14-LABEL: fun29: @@ -643,11 +643,11 @@ ; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26 ; CHECK-Z14-NEXT: vfchsb %v1, %v28, %v30 ; CHECK-Z14-NEXT: vx %v0, %v0, %v1 -; CHECK-Z14-NEXT: vmrlg %v1, %v0, %v0 -; CHECK-Z14-NEXT: vuphf %v1, %v1 -; CHECK-Z14-NEXT: vuphf %v0, %v0 -; CHECK-Z14-NEXT: vsel %v24, %v25, %v29, %v0 -; CHECK-Z14-NEXT: vsel %v26, %v27, %v31, %v1 +; CHECK-Z14-DAG: vmrlg [[REG0:%v[0-9]+]], %v0, %v0 +; CHECK-Z14-DAG: vuphf [[REG0]], [[REG0]] +; CHECK-Z14-DAG: vuphf [[REG1:%v[0-9]+]], %v0 +; CHECK-Z14-NEXT: vsel %v24, %v25, %v29, [[REG1]] +; CHECK-Z14-NEXT: vsel %v26, %v27, %v31, [[REG0]] ; CHECK-Z14-NEXT: br %r14 %cmp0 = fcmp ogt <4 x float> 
%val1, %val2 %cmp1 = fcmp ogt <4 x float> %val3, %val4 @@ -667,28 +667,28 @@ ; CHECK-NEXT: vmrhf %v17, %v28, %v28 ; CHECK-NEXT: vmrhf %v18, %v24, %v24 ; CHECK-NEXT: vldeb %v17, %v17 -; CHECK-NEXT: vl %v4, 192(%r15) -; CHECK-NEXT: vldeb %v18, %v18 -; CHECK-NEXT: vl %v5, 208(%r15) -; CHECK-NEXT: vl %v6, 160(%r15) -; CHECK-NEXT: vl %v7, 176(%r15) -; CHECK-NEXT: vl %v0, 272(%r15) -; CHECK-NEXT: vl %v1, 240(%r15) -; CHECK-NEXT: vfchdb %v17, %v18, %v17 -; CHECK-NEXT: vl %v2, 256(%r15) -; CHECK-NEXT: vl %v3, 224(%r15) +; CHECK-DAG: vl [[REG0:%v[0-9]+]], 192(%r15) +; CHECK-DAG: vldeb %v18, %v18 +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 208(%r15) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 160(%r15) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 176(%r15) +; CHECK-DAG: vl [[REG4:%v[0-9]+]], 272(%r15) +; CHECK-DAG: vl [[REG5:%v[0-9]+]], 240(%r15) +; CHECK-DAG: vfchdb %v17, %v18, %v17 +; CHECK-DAG: vl [[REG6:%v[0-9]+]], 256(%r15) +; CHECK-DAG: vl [[REG7:%v[0-9]+]], 224(%r15) ; CHECK-NEXT: vpkg %v16, %v17, %v16 ; CHECK-NEXT: vmrlf %v17, %v30, %v30 ; CHECK-NEXT: vmrlf %v18, %v26, %v26 ; CHECK-NEXT: vmrhf %v19, %v26, %v26 -; CHECK-NEXT: vfchdb %v7, %v27, %v7 -; CHECK-NEXT: vfchdb %v6, %v25, %v6 -; CHECK-NEXT: vfchdb %v5, %v31, %v5 -; CHECK-NEXT: vfchdb %v4, %v29, %v4 -; CHECK-NEXT: vpkg %v6, %v6, %v7 -; CHECK-NEXT: vpkg %v4, %v4, %v5 -; CHECK-NEXT: vn %v5, %v16, %v6 -; CHECK-NEXT: vsel %v24, %v3, %v2, %v5 +; CHECK-NEXT: vfchdb [[REG8:%v[0-9]+]], %v27, [[REG3]] +; CHECK-NEXT: vfchdb [[REG9:%v[0-9]+]], %v25, [[REG2]] +; CHECK-NEXT: vfchdb [[REG10:%v[0-9]+]], %v31, [[REG1]] +; CHECK-NEXT: vfchdb [[REG11:%v[0-9]+]], %v29, [[REG0]] +; CHECK-NEXT: vpkg [[REG12:%v[0-9]+]], [[REG9]], [[REG8]] +; CHECK-NEXT: vpkg [[REG13:%v[0-9]+]], [[REG11]], [[REG10]] +; CHECK-NEXT: vn [[REG14:%v[0-9]+]], %v16, [[REG12]] +; CHECK-NEXT: vsel %v24, [[REG7]], [[REG6]], [[REG14]] ; CHECK-NEXT: vldeb %v17, %v17 ; CHECK-NEXT: vldeb %v18, %v18 ; CHECK-NEXT: vfchdb %v17, %v18, %v17 @@ -697,32 +697,32 @@ ; CHECK-NEXT: vldeb %v19, %v19 
; CHECK-NEXT: vfchdb %v18, %v19, %v18 ; CHECK-NEXT: vpkg %v17, %v18, %v17 -; CHECK-NEXT: vn %v4, %v17, %v4 -; CHECK-NEXT: vsel %v26, %v1, %v0, %v4 +; CHECK-NEXT: vn [[REG15:%v[0-9]+]], %v17, [[REG13]] +; CHECK-NEXT: vsel %v26, [[REG5]], [[REG4]], [[REG15]] ; CHECK-NEXT: br %r14 ; ; CHECK-Z14-LABEL: fun30: ; CHECK-Z14: # BB#0: -; CHECK-Z14-NEXT: vl %v4, 192(%r15) -; CHECK-Z14-NEXT: vl %v5, 208(%r15) -; CHECK-Z14-NEXT: vl %v6, 160(%r15) -; CHECK-Z14-NEXT: vl %v7, 176(%r15) -; CHECK-Z14-NEXT: vfchdb %v7, %v27, %v7 -; CHECK-Z14-NEXT: vfchdb %v6, %v25, %v6 -; CHECK-Z14-NEXT: vfchdb %v5, %v31, %v5 -; CHECK-Z14-NEXT: vfchdb %v4, %v29, %v4 -; CHECK-Z14-NEXT: vfchsb %v16, %v24, %v28 -; CHECK-Z14-NEXT: vfchsb %v17, %v26, %v30 -; CHECK-Z14-NEXT: vpkg %v6, %v6, %v7 -; CHECK-Z14-NEXT: vpkg %v4, %v4, %v5 -; CHECK-Z14-NEXT: vl %v0, 272(%r15) -; CHECK-Z14-NEXT: vl %v1, 240(%r15) -; CHECK-Z14-NEXT: vl %v2, 256(%r15) -; CHECK-Z14-NEXT: vl %v3, 224(%r15) -; CHECK-Z14-NEXT: vn %v4, %v17, %v4 -; CHECK-Z14-NEXT: vn %v5, %v16, %v6 -; CHECK-Z14-NEXT: vsel %v24, %v3, %v2, %v5 -; CHECK-Z14-NEXT: vsel %v26, %v1, %v0, %v4 +; CHECK-Z14-DAG: vl [[REG0:%v[0-9]+]], 192(%r15) +; CHECK-Z14-DAG: vl [[REG1:%v[0-9]+]], 208(%r15) +; CHECK-Z14-DAG: vl [[REG2:%v[0-9]+]], 160(%r15) +; CHECK-Z14-DAG: vl [[REG3:%v[0-9]+]], 176(%r15) +; CHECK-Z14-NEXT: vfchdb [[REG4:%v[0-9]+]], %v27, [[REG3]] +; CHECK-Z14-NEXT: vfchdb [[REG5:%v[0-9]+]], %v25, [[REG2]] +; CHECK-Z14-NEXT: vfchdb [[REG6:%v[0-9]+]], %v31, [[REG1]] +; CHECK-Z14-NEXT: vfchdb [[REG7:%v[0-9]+]], %v29, [[REG0]] +; CHECK-Z14-NEXT: vfchsb [[REG8:%v[0-9]+]], %v24, %v28 +; CHECK-Z14-NEXT: vfchsb [[REG9:%v[0-9]+]], %v26, %v30 +; CHECK-Z14-NEXT: vpkg [[REG10:%v[0-9]+]], [[REG5]], [[REG4]] +; CHECK-Z14-NEXT: vpkg [[REG11:%v[0-9]+]], [[REG7]], [[REG6]] +; CHECK-Z14-NEXT: vl [[REG12:%v[0-9]+]], 272(%r15) +; CHECK-Z14-NEXT: vl [[REG13:%v[0-9]+]], 240(%r15) +; CHECK-Z14-NEXT: vl [[REG14:%v[0-9]+]], 256(%r15) +; CHECK-Z14-NEXT: vl [[REG15:%v[0-9]+]], 224(%r15) 
+; CHECK-Z14-NEXT: vn [[REG16:%v[0-9]+]], [[REG9]], [[REG11]] +; CHECK-Z14-NEXT: vn [[REG17:%v[0-9]+]], [[REG8]], [[REG10]] +; CHECK-Z14-NEXT: vsel %v24, [[REG15]], [[REG14]], [[REG17]] +; CHECK-Z14-NEXT: vsel %v26, [[REG13]], [[REG12]], [[REG16]] ; CHECK-Z14-NEXT: br %r14 %cmp0 = fcmp ogt <8 x float> %val1, %val2 %cmp1 = fcmp ogt <8 x double> %val3, %val4 @@ -787,9 +787,9 @@ ; CHECK-Z14: # BB#0: ; CHECK-Z14-NEXT: vfchdb %v0, %v26, %v30 ; CHECK-Z14-NEXT: vfchdb %v1, %v24, %v28 -; CHECK-Z14-NEXT: vpkg %v0, %v1, %v0 -; CHECK-Z14-NEXT: vfchsb %v1, %v25, %v27 -; CHECK-Z14-NEXT: vn %v0, %v0, %v1 +; CHECK-Z14-DAG: vpkg %v0, %v1, %v0 +; CHECK-Z14-DAG: vfchsb [[REG0:%v[0-9]+]], %v25, %v27 +; CHECK-Z14-NEXT: vn %v0, %v0, [[REG0]] ; CHECK-Z14-NEXT: vsel %v24, %v29, %v31, %v0 ; CHECK-Z14-NEXT: br %r14 %cmp0 = fcmp ogt <4 x double> %val1, %val2 @@ -828,18 +828,18 @@ ; ; CHECK-Z14-LABEL: fun34: ; CHECK-Z14: # BB#0: -; CHECK-Z14-NEXT: vfchsb %v4, %v25, %v27 -; CHECK-Z14-NEXT: vuphf %v5, %v4 -; CHECK-Z14-NEXT: vmrlg %v4, %v4, %v4 -; CHECK-Z14-NEXT: vfchdb %v2, %v24, %v28 -; CHECK-Z14-NEXT: vfchdb %v3, %v26, %v30 -; CHECK-Z14-NEXT: vuphf %v4, %v4 -; CHECK-Z14-NEXT: vl %v0, 176(%r15) -; CHECK-Z14-NEXT: vl %v1, 160(%r15) -; CHECK-Z14-NEXT: vn %v3, %v3, %v4 -; CHECK-Z14-NEXT: vn %v2, %v2, %v5 -; CHECK-Z14-NEXT: vsel %v24, %v29, %v1, %v2 -; CHECK-Z14-NEXT: vsel %v26, %v31, %v0, %v3 +; CHECK-Z14-NEXT: vfchsb [[REG0:%v[0-9]+]], %v25, %v27 +; CHECK-Z14-NEXT: vuphf %v5, [[REG0]] +; CHECK-Z14-NEXT: vmrlg [[REG0]], [[REG0]], [[REG0]] +; CHECK-Z14-NEXT: vfchdb [[REG1:%v[0-9]+]], %v24, %v28 +; CHECK-Z14-NEXT: vfchdb [[REG2:%v[0-9]+]], %v26, %v30 +; CHECK-Z14-NEXT: vuphf [[REG0]], [[REG0]] +; CHECK-Z14-NEXT: vl [[REG3:%v[0-9]+]], 176(%r15) +; CHECK-Z14-NEXT: vl [[REG4:%v[0-9]+]], 160(%r15) +; CHECK-Z14-NEXT: vn [[REG5:%v[0-9]+]], [[REG2]], [[REG0]] +; CHECK-Z14-NEXT: vn [[REG6:%v[0-9]+]], [[REG1]], %v5 +; CHECK-Z14-NEXT: vsel %v24, %v29, [[REG4]], [[REG6]] +; CHECK-Z14-NEXT: vsel %v26, %v31, 
[[REG3]], [[REG5]] ; CHECK-Z14-NEXT: br %r14 %cmp0 = fcmp ogt <4 x double> %val1, %val2 %cmp1 = fcmp ogt <4 x float> %val3, %val4