Index: include/llvm/CodeGen/MachineScheduler.h =================================================================== --- include/llvm/CodeGen/MachineScheduler.h +++ include/llvm/CodeGen/MachineScheduler.h @@ -789,7 +789,7 @@ enum CandReason : uint8_t { NoCand, Only1, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, - TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; + TopDepthReduce, TopPathReduce, NextDefUse, PhysRegCp2, NodeOrder}; #ifndef NDEBUG static const char *getReasonStr(GenericSchedulerBase::CandReason Reason); Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -1662,6 +1662,40 @@ } // end namespace llvm +// Check if CopyMI (which is expected to be a COPY) has as one of its operands +// a physical register that could be allocated to the other operand's virtual +// register. 
+static bool isCoalescablePRegCopy(const MachineInstr &CopyMI, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + assert(CopyMI.isCopy() && "Expected a COPY"); + + const MachineOperand *PRegMO = nullptr; + const MachineOperand *VRegMO = nullptr; + if (TargetRegisterInfo::isPhysicalRegister(CopyMI.getOperand(0).getReg())) { + PRegMO = &CopyMI.getOperand(0); + VRegMO = &CopyMI.getOperand(1); + } else { + PRegMO = &CopyMI.getOperand(1); + VRegMO = &CopyMI.getOperand(0); + } + if (!TargetRegisterInfo::isPhysicalRegister(PRegMO->getReg()) || + !TargetRegisterInfo::isVirtualRegister(VRegMO->getReg())) + return false; + + MCPhysReg PhysReg = PRegMO->getReg(); + unsigned VirtReg = VRegMO->getReg(); + unsigned VSub = VRegMO->getSubReg(); + + const TargetRegisterClass *VirtRC = MRI.getRegClass(VirtReg); + return !MRI.isReserved(PhysReg) && + (VirtRC->contains(PhysReg) || + (VSub && TRI.getMatchingSuperReg(PhysReg, VSub, VirtRC))); +} + +// EXPERIMENTAL +static cl::opt<bool> COPY_CONSTRAIN_CHECK("copy-constrain-check", cl::init(true)); + /// constrainLocalCopy handles two possibilities: /// 1) Local src: /// I0: = dst @@ -1685,6 +1719,10 @@ LiveIntervals *LIS = DAG->getLIS(); MachineInstr *Copy = CopySU->getInstr(); + const MachineFunction *MF = Copy->getParent()->getParent(); + const MachineRegisterInfo *MRI = &MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + // Check for pure vreg copies. const MachineOperand &SrcOp = Copy->getOperand(1); unsigned SrcReg = SrcOp.getReg(); @@ -1696,6 +1734,30 @@ if (!TargetRegisterInfo::isVirtualRegister(DstReg) || DstOp.isDead()) return; +if (COPY_CONSTRAIN_CHECK) { + // Check for connected phys regs. + // * Make sure SrcReg was not defined by a COPY of phys-reg in the region. 
+ for (MachineInstr &DI : MRI->def_instructions(SrcReg)) + if (DAG->getSUnit(&DI) && DI.isCopy() && + isCoalescablePRegCopy(DI, *TRI, *MRI)) + return; + + // * Make sure DstReg users are not producing values copied to phys-regs. + for (MachineInstr &DstRegUseMI : MRI->use_nodbg_instructions(DstReg)) { + if (DAG->getSUnit(&DstRegUseMI) == nullptr) + continue; + const MachineOperand &DefMO = DstRegUseMI.getOperand(0); + unsigned DefReg = (DefMO.isReg() && DefMO.isDef()) ? DefMO.getReg() : 0; + if (!DefReg) + continue; + for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) { + if (UseMI.isCopy() && DAG->getSUnit(&UseMI) != nullptr && + isCoalescablePRegCopy(UseMI, *TRI, *MRI)) + return; + } + } +} + // Check if either the dest or source is local. If it's live across a back // edge, it's not local. Note that if both vregs are live across the back // edge, we cannot successfully contrain the copy without cyclic scheduling. @@ -2503,6 +2565,7 @@ case BotHeightReduce:return "BOT-HEIGHT"; case BotPathReduce: return "BOT-PATH "; case NextDefUse: return "DEF-USE "; + case PhysRegCp2: return "PREG-CP-2 "; case NodeOrder: return "ORDER "; }; llvm_unreachable("Unknown reason!"); @@ -2843,6 +2906,96 @@ return 0; } +// EXPERIMENTAL +static cl::opt<bool> BEFORE_WEAK("cp-2-before-weak", cl::init(false)); +static cl::opt<bool> COUNT_DEF("cp-2-count-def", cl::init(true)); + +/// Find the copy-connected physregs of \p SU and the heuristical sum of them +/// (for a top region, it is incremented for a use and decremented for a +/// def). +static int findConnectedPhysRegs(const SUnit &SU, bool isTop, ScheduleDAGMILive *DAG) { + const MachineInstr *MI = SU.getInstr(); + const MachineBasicBlock *MBB = MI->getParent(); + const MachineFunction *MF = MBB->getParent(); + const MachineRegisterInfo *MRI = &MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // Don't move a copy of a subreg away from the point of definition. 
+ if (!MI->getNumOperands() || + (MI->isCopy() && MI->getOperand(1).getSubReg())) + return 0; + + int NumCopied = 0; + + // Check if there is a local coalescable phys-reg COPY that uses the result + // of \p SU. Return 0 if there is any other kind of use. + bool PhysRegDefUse = false; + const MachineOperand &MO = MI->getOperand(0); + if (MO.isReg() && MO.isDef()) + for (MachineInstr &Use : MRI->use_nodbg_instructions(MO.getReg())) { + if (Use.isCopy() && Use.getParent() == MI->getParent() && + isCoalescablePRegCopy(Use, *TRI, *MRI)) + PhysRegDefUse = true; + else + return 0; + } + if (PhysRegDefUse && COUNT_DEF) + NumCopied += (isTop ? -1 : 1); + + for (const MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + + // Check if there is a local coalescable phys-reg COPY that defines the + // used register. + bool LocalPhysRegUseDef = false; + for (MachineInstr &DI : MRI->def_instructions(MO.getReg())) + if (DI.isCopy() && DI.getParent() == MI->getParent() && + isCoalescablePRegCopy(DI, *TRI, *MRI)) { + LocalPhysRegUseDef = true; + break; + } + if (!LocalPhysRegUseDef) + continue; + + // Check if all users are local. + bool AllUsesLocal = true; + for (MachineInstr &Use : MRI->use_nodbg_instructions(MO.getReg())) { + if (Use.getParent() != MBB) { + AllUsesLocal = false; + break; + } + } + if (!AllUsesLocal) + continue; + + NumCopied += (isTop ? 1 : -1); + } + + return NumCopied; +} + +/// Minimize the (virtual) live ranges of copies involving phys-regs. In +/// regions with both incoming and outgoing arguments, this will reduce the +/// risk of overlapping live ranges that will hinder coalescing. In contrast +/// to biasPhysRegCopy(), this does not typically handle COPYs, but rather +/// instructions connected to a COPY involving a phys-reg. 
+static bool tryPhysRegCopies2(GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, + ScheduleDAGMILive *DAG) { + assert(TryCand.AtTop == Cand.AtTop && + "Expected two candidates from same boundary."); + + // Find the copy-connected physregs. + int TryCandCount = findConnectedPhysRegs(*TryCand.SU, TryCand.AtTop, DAG); + int CandCount = findConnectedPhysRegs(*Cand.SU, Cand.AtTop, DAG); + + // Prefer the candidate that has the most(top) / least(bot) number of copied + // phys-regs, to minimize those live-ranges. + return tryGreater(TryCandCount, CandCount, + TryCand, Cand, GenericSchedulerBase::PhysRegCp2); +} + void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, @@ -2954,6 +3107,10 @@ return; if (SameBoundary) { + if (BEFORE_WEAK) + if (tryPhysRegCopies2(TryCand, Cand, DAG)) + return; + // Weak edges are for clustering and other constraints. if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), getWeakLeft(Cand.SU, Cand.AtTop), @@ -2985,6 +3142,11 @@ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) return; + // Try to minimize live ranges of copied physregs. + if (!BEFORE_WEAK) + if (tryPhysRegCopies2(TryCand, Cand, DAG)) + return; + // Fall through to original instruction order. if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { Index: test/CodeGen/SystemZ/alloca-01.ll =================================================================== --- test/CodeGen/SystemZ/alloca-01.ll +++ test/CodeGen/SystemZ/alloca-01.ll @@ -13,10 +13,6 @@ ; Allocate %length bytes and take addresses based on the result. ; There are two stack arguments, so an offset of 160 + 2 * 8 == 176 ; is added to the copy of %r15. -; -; NOTE: 'la %r0, 177(%r1)' is actually an expected fail as it would -; be better (and possible) to load into %r3 directly. 
-; define i64 @f1(i64 %length, i64 %index) { ; FIXME: a better sequence would be: ; @@ -38,7 +34,7 @@ ; ; CHECK-B-LABEL: f1: ; CHECK-B: lgr %r15, %r1 -; CHECK-B: la %r0, 177(%r1) +; CHECK-B: la %r3, 177(%r1) ; ; CHECK-C-LABEL: f1: ; CHECK-C: lgr %r15, %r1 Index: test/CodeGen/SystemZ/args-06.ll =================================================================== --- test/CodeGen/SystemZ/args-06.ll +++ test/CodeGen/SystemZ/args-06.ll @@ -5,10 +5,10 @@ define i8 @f1(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g) { ; CHECK-LABEL: f1: -; CHECK: lb {{%r[0-5]}}, 175(%r15) -; CHECK: lb {{%r[0-5]}}, 167(%r15) -; CHECK: ar %r2, %r3 -; CHECK: ar %r2, %r4 +; CHECK-DAG: lb {{%r[0-5]}}, 175(%r15) +; CHECK-DAG: lb {{%r[0-5]}}, 167(%r15) +; CHECK-DAG: ar %r2, %r3 +; CHECK-DAG: ar %r2, %r4 ; CHECK: ar %r2, %r5 ; CHECK: ar %r2, %r6 ; CHECK: br %r14 Index: test/CodeGen/SystemZ/args-10.ll =================================================================== --- test/CodeGen/SystemZ/args-10.ll +++ test/CodeGen/SystemZ/args-10.ll @@ -8,7 +8,7 @@ ; CHECK-DAG: lg [[REGL:%r[0-5]+]], 8(%r6) ; CHECK-DAG: lg [[REGH:%r[0-5]+]], 0(%r6) ; CHECK: algr [[REGL]], [[REGL]] -; CHECK-NEXT: alcgr [[REGH]], [[REGH]] +; CHECK-DAG: alcgr [[REGH]], [[REGH]] ; CHECK-DAG: stg [[REGL]], 8(%r2) ; CHECK-DAG: stg [[REGH]], 0(%r2) ; CHECK: br %r14 @@ -25,7 +25,7 @@ ; CHECK-DAG: lg [[REGL:%r[0-5]+]], 8([[ADDR]]) ; CHECK-DAG: lg [[REGH:%r[0-5]+]], 0([[ADDR]]) ; CHECK: algr [[REGL]], [[REGL]] -; CHECK-NEXT: alcgr [[REGH]], [[REGH]] +; CHECK-DAG: alcgr [[REGH]], [[REGH]] ; CHECK-DAG: stg [[REGL]], 8(%r2) ; CHECK-DAG: stg [[REGH]], 0(%r2) ; CHECK: br %r14 @@ -40,7 +40,7 @@ ; CHECK-DAG: lg [[REGL:%r[0-5]+]], 8(%r3) ; CHECK-DAG: lg [[REGH:%r[0-5]+]], 0(%r3) ; CHECK: algr [[REGL]], [[REGL]] -; CHECK-NEXT: alcgr [[REGH]], [[REGH]] +; CHECK-DAG: alcgr [[REGH]], [[REGH]] ; CHECK-DAG: stg [[REGL]], 8(%r2) ; CHECK-DAG: stg [[REGH]], 0(%r2) ; CHECK: br %r14 Index: test/CodeGen/SystemZ/args-11.mir 
=================================================================== --- /dev/null +++ test/CodeGen/SystemZ/args-11.mir @@ -0,0 +1,102 @@ +# RUN: llc -o - %s -mtriple=s390x-linux-gnu -run-pass=machine-scheduler \ +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s + +# Test that an extra COPY due to poor pre-RA scheduling is avoided. This +# would happen if the '%7 = ADJDYNALLOC ...' is scheduled above the three +# ADJDYNALLOCs using %1 (%r3d dependency). + +--- | + + declare i64 @bar(i8*, i8*, i8*, i8*, i8*, i64, i64) + + define i64 @f1(i64 %length, i64 %index) { + %a = alloca i8, i64 %length + %b = getelementptr i8, i8* %a, i64 1 + %cindex = add i64 %index, 3919 + %c = getelementptr i8, i8* %a, i64 %cindex + %dindex = add i64 %index, 3920 + %d = getelementptr i8, i8* %a, i64 %dindex + %eindex = add i64 %index, 4095 + %e = getelementptr i8, i8* %a, i64 %eindex + %count = call i64 @bar(i8* %a, i8* %b, i8* %c, i8* %d, i8* %e, i64 0, i64 0) + %res = add i64 %count, 1 + ret i64 %res + } + +... 
+ +# CHECK: ********** MI Scheduling ********** +# CHECK: f1:%bb.0 +# CHECK: From: %7:gr64bit = ADJDYNALLOC %5:addr64bit, 1, $noreg +# CHECK: To: CallBRASL +# CHECK: *** Final schedule for %bb.0 *** +# CHECK: SU(1): %8:gr64bit = ADJDYNALLOC %5:addr64bit, 3919, %1:addr64bit +# CHECK: SU(2): %9:gr64bit = ADJDYNALLOC %5:addr64bit, 3920, %1:addr64bit +# CHECK: SU(3): %10:gr64bit = ADJDYNALLOC %5:addr64bit, 4095, %1:addr64bit +# CHECK: SU(0): %7:gr64bit = ADJDYNALLOC %5:addr64bit, 1, $noreg +# +# CHECK: ********** MI Scheduling ********** +# CHECK: f1:%bb.0 +# CHECK: From: %1:addr64bit = COPY $r3d +# CHECK: To: $r15d = COPY %5:addr64bit + + +--- +name: f1 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: addr64bit } + - { id: 1, class: addr64bit } + - { id: 2, class: gr64bit } + - { id: 3, class: gr64bit } + - { id: 4, class: gr64bit } + - { id: 5, class: addr64bit } + - { id: 6, class: gr64bit } + - { id: 7, class: gr64bit } + - { id: 8, class: gr64bit } + - { id: 9, class: gr64bit } + - { id: 10, class: gr64bit } + - { id: 11, class: addr64bit } + - { id: 12, class: addr64bit } + - { id: 13, class: gr64bit } +liveins: + - { reg: '$r2d', virtual-reg: '%0' } + - { reg: '$r3d', virtual-reg: '%1' } +frameInfo: + maxAlignment: 8 + hasCalls: true +stack: + - { id: 0, name: a, type: variable-sized, alignment: 1, stack-id: 0 } +body: | + bb.0 (%ir-block.0): + liveins: $r2d, $r3d + + %1 = COPY $r3d + %0 = COPY $r2d + %3 = LA %0, 7, _ + %3 = NILL64 %3, 65528, implicit-def dead $cc + %5 = COPY $r15d + %5 = SGR %5, %3, implicit-def dead $cc + %6 = ADJDYNALLOC %5, 0, _ + $r15d = COPY %5 + %7 = ADJDYNALLOC %5, 1, _ + %8 = ADJDYNALLOC %5, 3919, %1 + %9 = ADJDYNALLOC %5, 3920, %1 + %10 = ADJDYNALLOC %5, 4095, %1 + ADJCALLSTACKDOWN 16, 0 + MVGHI $r15d, 168, 0 :: (store 8) + MVGHI $r15d, 160, 0 :: (store 8) + $r2d = COPY %6 + $r3d = COPY %7 + $r4d = COPY %8 + $r5d = COPY %9 + $r6d = COPY %10 + CallBRASL @bar, $r2d, $r3d, killed $r4d, killed $r5d, killed $r6d, 
csr_systemz, implicit-def dead $r14d, implicit-def dead $cc, implicit-def $r2d + ADJCALLSTACKUP 16, 0 + %12 = COPY $r2d + %13 = LA %12, 1, _ + $r2d = COPY %13 + Return implicit $r2d + +... Index: test/CodeGen/SystemZ/cond-move-02.ll =================================================================== --- test/CodeGen/SystemZ/cond-move-02.ll +++ test/CodeGen/SystemZ/cond-move-02.ll @@ -4,8 +4,8 @@ define i32 @f1(i32 %x) { ; CHECK-LABEL: f1: -; CHECK: lhi [[REG:%r[0-5]]], 0 -; CHECK: chi %r2, 0 +; CHECK-DAG: lhi [[REG:%r[0-5]]], 0 +; CHECK-DAG: chi %r2, 0 ; CHECK: lochilh [[REG]], 42 ; CHECK: br %r14 %cond = icmp ne i32 %x, 0 @@ -35,8 +35,8 @@ define i64 @f4(i64 %x) { ; CHECK-LABEL: f4: -; CHECK: lghi [[REG:%r[0-5]]], 0 -; CHECK: cghi %r2, 0 +; CHECK-DAG: lghi [[REG:%r[0-5]]], 0 +; CHECK-DAG: cghi %r2, 0 ; CHECK: locghilh [[REG]], 42 ; CHECK: br %r14 %cond = icmp ne i64 %x, 0 Index: test/CodeGen/SystemZ/risbg-01.ll =================================================================== --- test/CodeGen/SystemZ/risbg-01.ll +++ test/CodeGen/SystemZ/risbg-01.ll @@ -233,11 +233,9 @@ ; Now try an arithmetic right shift in which the sign bits aren't needed. ; Introduce a second use of %shr so that the ashr doesn't decompose to ; an lshr. 
-; NOTE: the extra move to %r2 should not be needed (temporary FAIL) define i32 @f21(i32 %foo, i32 *%dest) { ; CHECK-LABEL: f21: -; CHECK: risbg %r0, %r2, 60, 190, 36 -; CHECK: lr %r2, %r0 +; CHECK: risbg %r2, %r2, 60, 190, 36 ; CHECK: br %r14 %shr = ashr i32 %foo, 28 store i32 %shr, i32 *%dest Index: test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll =================================================================== --- test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll +++ test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll @@ -64,7 +64,7 @@ ; CHECK-DAG: vceqh [[REG4:%v[0-9]+]], %v30, %v27 ; CHECK-DAG: vl [[REG5:%v[0-9]+]], 176(%r15) ; CHECK-DAG: vl [[REG6:%v[0-9]+]], 160(%r15) -; CHECK-DAG: vo [[REG7:%v[0-9]+]], %v2, [[REG4]] +; CHECK-DAG: vo [[REG7:%v[0-9]+]], [[REG1]], [[REG4]] ; CHECK-DAG: vo [[REG8:%v[0-9]+]], [[REG2]], [[REG3]] ; CHECK-DAG: vsel %v24, %v29, [[REG6]], [[REG8]] ; CHECK-DAG: vsel %v26, %v31, [[REG5]], [[REG7]] @@ -185,7 +185,7 @@ ; CHECK-DAG: vceqh [[REG0:%v[0-9]+]], %v24, %v26 ; CHECK-DAG: vceqh [[REG1:%v[0-9]+]], %v28, %v30 ; CHECK-NEXT: vx [[REG2:%v[0-9]+]], [[REG0]], [[REG1]] -; CHECK-DAG: vuphh [[REG3:%v[0-9]+]], [[REG2]] +; CHECK-NEXT: vuphh [[REG3:%v[0-9]+]], [[REG2]] ; CHECK-DAG: vmrlg [[REG4:%v[0-9]+]], [[REG2]], [[REG2]] ; CHECK-DAG: vuphh [[REG5:%v[0-9]+]], [[REG4]] ; CHECK-NEXT: vsel %v24, %v25, %v29, [[REG3]] @@ -346,7 +346,7 @@ ; CHECK-NEXT: vceqf %v0, %v24, %v26 ; CHECK-NEXT: vuphh %v1, %v1 ; CHECK-NEXT: vn %v0, %v0, %v1 -; CHECK-DAG: vuphf [[REG0:%v[0-9]+]], %v0 +; CHECK-NEXT: vuphf [[REG0:%v[0-9]+]], %v0 ; CHECK-DAG: vmrlg [[REG1:%v[0-9]+]], %v0, %v0 ; CHECK-DAG: vuphf [[REG2:%v[0-9]+]], [[REG1]] ; CHECK-NEXT: vsel %v24, %v25, %v29, [[REG0]] @@ -439,9 +439,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vceqg %v0, %v26, %v30 ; CHECK-NEXT: vceqg %v1, %v24, %v28 -; CHECK-NEXT: vpkg %v0, %v1, %v0 -; CHECK-NEXT: vceqf %v1, %v25, %v27 -; CHECK-NEXT: vx %v0, %v0, %v1 +; CHECK-DAG: vpkg %v0, %v1, %v0 +; CHECK-DAG: vceqf [[REG0:%v[0-9]+]], %v25, %v27 
+; CHECK-NEXT: vx %v0, %v0, [[REG0]] ; CHECK-NEXT: vsel %v24, %v29, %v31, %v0 ; CHECK-NEXT: br %r14 %cmp0 = icmp eq <4 x i64> %val1, %val2 @@ -477,20 +477,20 @@ define <2 x float> @fun25(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4, <2 x float> %val5, <2 x float> %val6) { ; CHECK-LABEL: fun25: ; CHECK: # %bb.0: -; CHECK-NEXT: vmrlf %v0, %v26, %v26 -; CHECK-NEXT: vmrlf %v1, %v24, %v24 -; CHECK-NEXT: vldeb %v0, %v0 -; CHECK-NEXT: vldeb %v1, %v1 -; CHECK-NEXT: vfchdb %v0, %v1, %v0 -; CHECK-NEXT: vmrhf %v1, %v26, %v26 -; CHECK-NEXT: vmrhf %v2, %v24, %v24 -; CHECK-NEXT: vldeb %v1, %v1 -; CHECK-NEXT: vldeb %v2, %v2 -; CHECK-NEXT: vfchdb %v1, %v2, %v1 -; CHECK-NEXT: vpkg %v0, %v1, %v0 -; CHECK-NEXT: vfchdb %v1, %v28, %v30 -; CHECK-NEXT: vpkg %v1, %v1, %v1 -; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vmrlf [[REG0:%v[0-9]+]], %v26, %v26 +; CHECK-NEXT: vmrlf [[REG1:%v[0-9]+]], %v24, %v24 +; CHECK-NEXT: vldeb [[REG0]], [[REG0]] +; CHECK-NEXT: vldeb [[REG1]], [[REG1]] +; CHECK-NEXT: vfchdb [[REG5:%v[0-9]+]], [[REG1]], [[REG0]] +; CHECK-NEXT: vmrhf [[REG2:%v[0-9]+]], %v26, %v26 +; CHECK-NEXT: vmrhf [[REG3:%v[0-9]+]], %v24, %v24 +; CHECK-NEXT: vldeb [[REG2]], [[REG2]] +; CHECK-DAG: vldeb [[REG3]], [[REG3]] +; CHECK-DAG: vfchdb [[REG4:%v[0-9]+]], [[REG3]], [[REG2]] +; CHECK-DAG: vpkg [[REG7:%v[0-9]+]], [[REG4]], [[REG5]] +; CHECK-DAG: vfchdb [[REG6:%v[0-9]+]], %v28, %v30 +; CHECK-DAG: vpkg [[REG6]], [[REG6]], [[REG6]] +; CHECK-NEXT: vo %v0, [[REG7]], [[REG6]] ; CHECK-NEXT: vsel %v24, %v25, %v27, %v0 ; CHECK-NEXT: br %r14 ; @@ -512,28 +512,28 @@ define <2 x double> @fun26(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4, <2 x double> %val5, <2 x double> %val6) { ; CHECK-LABEL: fun26: ; CHECK: # %bb.0: -; CHECK-NEXT: vmrlf %v0, %v26, %v26 -; CHECK-NEXT: vmrlf %v1, %v24, %v24 -; CHECK-NEXT: vldeb %v0, %v0 -; CHECK-NEXT: vldeb %v1, %v1 -; CHECK-NEXT: vfchdb %v0, %v1, %v0 -; CHECK-NEXT: vmrhf %v1, %v26, %v26 -; CHECK-NEXT: 
vmrhf %v2, %v24, %v24 -; CHECK-NEXT: vldeb %v1, %v1 -; CHECK-NEXT: vldeb %v2, %v2 -; CHECK-NEXT: vfchdb %v1, %v2, %v1 -; CHECK-NEXT: vpkg %v0, %v1, %v0 -; CHECK-NEXT: vuphf %v0, %v0 -; CHECK-NEXT: vfchdb %v1, %v28, %v30 -; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vmrlf [[REG0:%v[0-9]+]], %v26, %v26 +; CHECK-NEXT: vmrlf [[REG1:%v[0-9]+]], %v24, %v24 +; CHECK-NEXT: vldeb [[REG0]], [[REG0]] +; CHECK-NEXT: vldeb [[REG1]], [[REG1]] +; CHECK-NEXT: vfchdb [[REG2:%v[0-9]+]], [[REG1]], [[REG0]] +; CHECK-NEXT: vmrhf [[REG3:%v[0-9]+]], %v26, %v26 +; CHECK-NEXT: vmrhf [[REG4:%v[0-9]+]], %v24, %v24 +; CHECK-NEXT: vldeb [[REG3]], [[REG3]] +; CHECK-DAG: vldeb [[REG4]], [[REG4]] +; CHECK-DAG: vfchdb [[REG5:%v[0-9]+]], [[REG4]], [[REG3]] +; CHECK-DAG: vpkg [[REG6:%v[0-9]+]], [[REG5]], [[REG2]] +; CHECK-DAG: vuphf [[REG6]], [[REG6]] +; CHECK-DAG: vfchdb [[REG7:%v[0-9]+]], %v28, %v30 +; CHECK-NEXT: vo %v0, [[REG6]], [[REG7]] ; CHECK-NEXT: vsel %v24, %v25, %v27, %v0 ; CHECK-NEXT: br %r14 ; ; CHECK-Z14-LABEL: fun26: ; CHECK-Z14: # %bb.0: ; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26 -; CHECK-Z14-NEXT: vuphf %v0, %v0 -; CHECK-Z14-NEXT: vfchdb %v1, %v28, %v30 +; CHECK-Z14-DAG: vuphf %v0, %v0 +; CHECK-Z14-DAG: vfchdb %v1, %v28, %v30 ; CHECK-Z14-NEXT: vo %v0, %v0, %v1 ; CHECK-Z14-NEXT: vsel %v24, %v25, %v27, %v0 ; CHECK-Z14-NEXT: br %r14 @@ -630,12 +630,12 @@ ; CHECK-NEXT: vldeb %v3, %v3 ; CHECK-NEXT: vfchdb %v2, %v3, %v2 ; CHECK-NEXT: vpkg %v1, %v2, %v1 -; CHECK-NEXT: vx %v0, %v0, %v1 -; CHECK-NEXT: vmrlg %v1, %v0, %v0 -; CHECK-NEXT: vuphf %v1, %v1 -; CHECK-NEXT: vuphf %v0, %v0 -; CHECK-NEXT: vsel %v24, %v25, %v29, %v0 -; CHECK-NEXT: vsel %v26, %v27, %v31, %v1 +; CHECK-NEXT: vx [[REG1:%v[0-9]+]], %v0, %v1 +; CHECK-DAG: vmrlg [[REG0:%v[0-9]+]], [[REG1]], [[REG1]] +; CHECK-DAG: vuphf [[REG2:%v[0-9]+]], [[REG1]] +; CHECK-DAG: vuphf [[REG0]], [[REG0]] +; CHECK-NEXT: vsel %v24, %v25, %v29, [[REG2]] +; CHECK-NEXT: vsel %v26, %v27, %v31, [[REG0]] ; CHECK-NEXT: br %r14 ; ; CHECK-Z14-LABEL: fun29: 
@@ -643,11 +643,11 @@ ; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26 ; CHECK-Z14-NEXT: vfchsb %v1, %v28, %v30 ; CHECK-Z14-NEXT: vx %v0, %v0, %v1 -; CHECK-Z14-NEXT: vmrlg %v1, %v0, %v0 -; CHECK-Z14-NEXT: vuphf %v1, %v1 -; CHECK-Z14-NEXT: vuphf %v0, %v0 -; CHECK-Z14-NEXT: vsel %v24, %v25, %v29, %v0 -; CHECK-Z14-NEXT: vsel %v26, %v27, %v31, %v1 +; CHECK-Z14-DAG: vmrlg [[REG0:%v[0-9]+]], %v0, %v0 +; CHECK-Z14-DAG: vuphf [[REG0]], [[REG0]] +; CHECK-Z14-DAG: vuphf [[REG1:%v[0-9]+]], %v0 +; CHECK-Z14-NEXT: vsel %v24, %v25, %v29, [[REG1]] +; CHECK-Z14-NEXT: vsel %v26, %v27, %v31, [[REG0]] ; CHECK-Z14-NEXT: br %r14 %cmp0 = fcmp ogt <4 x float> %val1, %val2 %cmp1 = fcmp ogt <4 x float> %val3, %val4 @@ -667,28 +667,28 @@ ; CHECK-NEXT: vmrhf %v17, %v28, %v28 ; CHECK-NEXT: vmrhf %v18, %v24, %v24 ; CHECK-NEXT: vldeb %v17, %v17 -; CHECK-NEXT: vl %v4, 192(%r15) -; CHECK-NEXT: vldeb %v18, %v18 -; CHECK-NEXT: vl %v5, 208(%r15) -; CHECK-NEXT: vl %v6, 160(%r15) -; CHECK-NEXT: vl %v7, 176(%r15) -; CHECK-NEXT: vl %v0, 272(%r15) -; CHECK-NEXT: vl %v1, 240(%r15) -; CHECK-NEXT: vfchdb %v17, %v18, %v17 -; CHECK-NEXT: vl %v2, 256(%r15) -; CHECK-NEXT: vl %v3, 224(%r15) +; CHECK-DAG: vl [[REG0:%v[0-9]+]], 192(%r15) +; CHECK-DAG: vldeb %v18, %v18 +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 208(%r15) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 160(%r15) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 176(%r15) +; CHECK-DAG: vl [[REG4:%v[0-9]+]], 272(%r15) +; CHECK-DAG: vl [[REG5:%v[0-9]+]], 240(%r15) +; CHECK-DAG: vfchdb %v17, %v18, %v17 +; CHECK-DAG: vl [[REG6:%v[0-9]+]], 256(%r15) +; CHECK-DAG: vl [[REG7:%v[0-9]+]], 224(%r15) ; CHECK-NEXT: vpkg %v16, %v17, %v16 ; CHECK-NEXT: vmrlf %v17, %v30, %v30 ; CHECK-NEXT: vmrlf %v18, %v26, %v26 ; CHECK-NEXT: vmrhf %v19, %v26, %v26 -; CHECK-NEXT: vfchdb %v7, %v27, %v7 -; CHECK-NEXT: vfchdb %v6, %v25, %v6 -; CHECK-NEXT: vfchdb %v5, %v31, %v5 -; CHECK-NEXT: vfchdb %v4, %v29, %v4 -; CHECK-NEXT: vpkg %v6, %v6, %v7 -; CHECK-NEXT: vpkg %v4, %v4, %v5 -; CHECK-NEXT: vn %v5, %v16, %v6 -; 
CHECK-NEXT: vsel %v24, %v3, %v2, %v5 +; CHECK-NEXT: vfchdb [[REG8:%v[0-9]+]], %v27, [[REG3]] +; CHECK-NEXT: vfchdb [[REG9:%v[0-9]+]], %v25, [[REG2]] +; CHECK-NEXT: vfchdb [[REG10:%v[0-9]+]], %v31, [[REG1]] +; CHECK-NEXT: vfchdb [[REG11:%v[0-9]+]], %v29, [[REG0]] +; CHECK-NEXT: vpkg [[REG12:%v[0-9]+]], [[REG9]], [[REG8]] +; CHECK-NEXT: vpkg [[REG13:%v[0-9]+]], [[REG11]], [[REG10]] +; CHECK-NEXT: vn [[REG14:%v[0-9]+]], %v16, [[REG12]] +; CHECK-NEXT: vsel %v24, [[REG7]], [[REG6]], [[REG14]] ; CHECK-NEXT: vldeb %v17, %v17 ; CHECK-NEXT: vldeb %v18, %v18 ; CHECK-NEXT: vfchdb %v17, %v18, %v17 @@ -697,32 +697,32 @@ ; CHECK-NEXT: vldeb %v19, %v19 ; CHECK-NEXT: vfchdb %v18, %v19, %v18 ; CHECK-NEXT: vpkg %v17, %v18, %v17 -; CHECK-NEXT: vn %v4, %v17, %v4 -; CHECK-NEXT: vsel %v26, %v1, %v0, %v4 +; CHECK-NEXT: vn [[REG15:%v[0-9]+]], %v17, [[REG13]] +; CHECK-NEXT: vsel %v26, [[REG5]], [[REG4]], [[REG15]] ; CHECK-NEXT: br %r14 ; ; CHECK-Z14-LABEL: fun30: ; CHECK-Z14: # %bb.0: -; CHECK-Z14-NEXT: vl %v4, 192(%r15) -; CHECK-Z14-NEXT: vl %v5, 208(%r15) -; CHECK-Z14-NEXT: vl %v6, 160(%r15) -; CHECK-Z14-NEXT: vl %v7, 176(%r15) -; CHECK-Z14-NEXT: vfchdb %v7, %v27, %v7 -; CHECK-Z14-NEXT: vfchdb %v6, %v25, %v6 -; CHECK-Z14-NEXT: vfchdb %v5, %v31, %v5 -; CHECK-Z14-NEXT: vfchdb %v4, %v29, %v4 -; CHECK-Z14-NEXT: vfchsb %v16, %v24, %v28 -; CHECK-Z14-NEXT: vfchsb %v17, %v26, %v30 -; CHECK-Z14-NEXT: vpkg %v6, %v6, %v7 -; CHECK-Z14-NEXT: vpkg %v4, %v4, %v5 -; CHECK-Z14-NEXT: vl %v0, 272(%r15) -; CHECK-Z14-NEXT: vl %v1, 240(%r15) -; CHECK-Z14-NEXT: vl %v2, 256(%r15) -; CHECK-Z14-NEXT: vl %v3, 224(%r15) -; CHECK-Z14-NEXT: vn %v4, %v17, %v4 -; CHECK-Z14-NEXT: vn %v5, %v16, %v6 -; CHECK-Z14-NEXT: vsel %v24, %v3, %v2, %v5 -; CHECK-Z14-NEXT: vsel %v26, %v1, %v0, %v4 +; CHECK-Z14-DAG: vl [[REG0:%v[0-9]+]], 192(%r15) +; CHECK-Z14-DAG: vl [[REG1:%v[0-9]+]], 208(%r15) +; CHECK-Z14-DAG: vl [[REG2:%v[0-9]+]], 160(%r15) +; CHECK-Z14-DAG: vl [[REG3:%v[0-9]+]], 176(%r15) +; CHECK-Z14-NEXT: vfchdb 
[[REG4:%v[0-9]+]], %v27, [[REG3]] +; CHECK-Z14-NEXT: vfchdb [[REG5:%v[0-9]+]], %v25, [[REG2]] +; CHECK-Z14-NEXT: vfchdb [[REG6:%v[0-9]+]], %v31, [[REG1]] +; CHECK-Z14-NEXT: vfchdb [[REG7:%v[0-9]+]], %v29, [[REG0]] +; CHECK-Z14-NEXT: vfchsb [[REG8:%v[0-9]+]], %v24, %v28 +; CHECK-Z14-NEXT: vfchsb [[REG9:%v[0-9]+]], %v26, %v30 +; CHECK-Z14-NEXT: vpkg [[REG10:%v[0-9]+]], [[REG5]], [[REG4]] +; CHECK-Z14-NEXT: vpkg [[REG11:%v[0-9]+]], [[REG7]], [[REG6]] +; CHECK-Z14-NEXT: vl [[REG12:%v[0-9]+]], 272(%r15) +; CHECK-Z14-NEXT: vl [[REG13:%v[0-9]+]], 240(%r15) +; CHECK-Z14-NEXT: vl [[REG14:%v[0-9]+]], 256(%r15) +; CHECK-Z14-NEXT: vl [[REG15:%v[0-9]+]], 224(%r15) +; CHECK-Z14-NEXT: vn [[REG16:%v[0-9]+]], [[REG9]], [[REG11]] +; CHECK-Z14-NEXT: vn [[REG17:%v[0-9]+]], [[REG8]], [[REG10]] +; CHECK-Z14-NEXT: vsel %v24, [[REG15]], [[REG14]], [[REG17]] +; CHECK-Z14-NEXT: vsel %v26, [[REG13]], [[REG12]], [[REG16]] ; CHECK-Z14-NEXT: br %r14 %cmp0 = fcmp ogt <8 x float> %val1, %val2 %cmp1 = fcmp ogt <8 x double> %val3, %val4 @@ -787,9 +787,9 @@ ; CHECK-Z14: # %bb.0: ; CHECK-Z14-NEXT: vfchdb %v0, %v26, %v30 ; CHECK-Z14-NEXT: vfchdb %v1, %v24, %v28 -; CHECK-Z14-NEXT: vpkg %v0, %v1, %v0 -; CHECK-Z14-NEXT: vfchsb %v1, %v25, %v27 -; CHECK-Z14-NEXT: vn %v0, %v0, %v1 +; CHECK-Z14-DAG: vpkg %v0, %v1, %v0 +; CHECK-Z14-DAG: vfchsb [[REG0:%v[0-9]+]], %v25, %v27 +; CHECK-Z14-NEXT: vn %v0, %v0, [[REG0]] ; CHECK-Z14-NEXT: vsel %v24, %v29, %v31, %v0 ; CHECK-Z14-NEXT: br %r14 %cmp0 = fcmp ogt <4 x double> %val1, %val2 @@ -828,18 +828,18 @@ ; ; CHECK-Z14-LABEL: fun34: ; CHECK-Z14: # %bb.0: -; CHECK-Z14-NEXT: vfchsb %v4, %v25, %v27 -; CHECK-Z14-NEXT: vuphf %v5, %v4 -; CHECK-Z14-NEXT: vmrlg %v4, %v4, %v4 -; CHECK-Z14-NEXT: vfchdb %v2, %v24, %v28 -; CHECK-Z14-NEXT: vfchdb %v3, %v26, %v30 -; CHECK-Z14-NEXT: vuphf %v4, %v4 -; CHECK-Z14-NEXT: vl %v0, 176(%r15) -; CHECK-Z14-NEXT: vl %v1, 160(%r15) -; CHECK-Z14-NEXT: vn %v3, %v3, %v4 -; CHECK-Z14-NEXT: vn %v2, %v2, %v5 -; CHECK-Z14-NEXT: vsel %v24, %v29, %v1, 
%v2 -; CHECK-Z14-NEXT: vsel %v26, %v31, %v0, %v3 +; CHECK-Z14-NEXT: vfchsb [[REG0:%v[0-9]+]], %v25, %v27 +; CHECK-Z14-NEXT: vuphf %v5, [[REG0]] +; CHECK-Z14-NEXT: vmrlg [[REG0]], [[REG0]], [[REG0]] +; CHECK-Z14-NEXT: vfchdb [[REG1:%v[0-9]+]], %v24, %v28 +; CHECK-Z14-NEXT: vfchdb [[REG2:%v[0-9]+]], %v26, %v30 +; CHECK-Z14-NEXT: vuphf [[REG0]], [[REG0]] +; CHECK-Z14-NEXT: vl [[REG3:%v[0-9]+]], 176(%r15) +; CHECK-Z14-NEXT: vl [[REG4:%v[0-9]+]], 160(%r15) +; CHECK-Z14-NEXT: vn [[REG5:%v[0-9]+]], [[REG2]], [[REG0]] +; CHECK-Z14-NEXT: vn [[REG6:%v[0-9]+]], [[REG1]], %v5 +; CHECK-Z14-NEXT: vsel %v24, %v29, [[REG4]], [[REG6]] +; CHECK-Z14-NEXT: vsel %v26, %v31, [[REG3]], [[REG5]] ; CHECK-Z14-NEXT: br %r14 %cmp0 = fcmp ogt <4 x double> %val1, %val2 %cmp1 = fcmp ogt <4 x float> %val3, %val4