Index: include/llvm/CodeGen/MachineScheduler.h =================================================================== --- include/llvm/CodeGen/MachineScheduler.h +++ include/llvm/CodeGen/MachineScheduler.h @@ -163,8 +163,12 @@ // first. bool DisableLatencyHeuristic; + // If true, try to use instructions which can fold a reload of a reg. + bool FoldableReloadHeuristic; + MachineSchedPolicy(): ShouldTrackPressure(false), ShouldTrackLaneMasks(false), - OnlyTopDown(false), OnlyBottomUp(false), DisableLatencyHeuristic(false) {} + OnlyTopDown(false), OnlyBottomUp(false), DisableLatencyHeuristic(false), + FoldableReloadHeuristic(false) {} }; /// MachineSchedStrategy - Interface to the scheduling algorithm used by @@ -196,6 +200,9 @@ /// Initialize the strategy after building the DAG for a new region. virtual void initialize(ScheduleDAGMI *DAG) = 0; + /// Tell strategy that a region is done, so that it can write stats. + virtual void leaveRegion() {}; + /// Notify this strategy that all roots have been released (including those /// that depend on EntrySU or ExitSU). virtual void registerRoots() {} @@ -769,7 +776,7 @@ /// pickNodeBidirectional depends on these listed by decreasing priority. enum CandReason { NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax, - ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, + ResourceReduce, ResourceDemand, FoldReload, BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; #ifndef NDEBUG Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -921,6 +921,13 @@ } public: + /// Return true if MI has an equivalent instruction that instead + /// reads one source reg from memory. 
If reg is 0, true is returned
+  /// if such an equivalent instruction exists, but if reg is given a
+  /// check is done that reg is used in the foldable operand.
+  virtual bool hasFoldableOperand(const MachineInstr *MI,
+                                  unsigned reg = 0) const { return false; }
+
   /// unfoldMemoryOperand - Separate a single instruction which folded a load or
   /// a store or a load and a store into two or more instruction. If this is
   /// possible, returns true as well as the new instructions by reference.
Index: lib/CodeGen/CalcSpillWeights.cpp
===================================================================
--- lib/CodeGen/CalcSpillWeights.cpp
+++ lib/CodeGen/CalcSpillWeights.cpp
@@ -131,6 +131,7 @@
 VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) {
   MachineRegisterInfo &mri = MF.getRegInfo();
   const TargetRegisterInfo &tri = *MF.getSubtarget().getRegisterInfo();
+  const TargetInstrInfo &tii = *MF.getSubtarget().getInstrInfo();
   MachineBasicBlock *mbb = nullptr;
   MachineLoop *loop = nullptr;
   bool isExiting = false;
@@ -170,6 +171,11 @@
     // Calculate instr weight.
     bool reads, writes;
     std::tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg);
+    // If mi can be transformed to fold a reload of li.reg, the read is
+    // not counted when computing the spill weight below.
+    if (reads && tii.hasFoldableOperand(mi, li.reg))
+      reads = false;
+
     weight = LiveIntervals::getSpillWeight(
       writes, reads, &MBFI, mi);
Index: lib/CodeGen/MachineScheduler.cpp
===================================================================
--- lib/CodeGen/MachineScheduler.cpp
+++ lib/CodeGen/MachineScheduler.cpp
@@ -191,6 +191,7 @@
   AU.setPreservesCFG();
   AU.addRequiredID(MachineDominatorsID);
   AU.addRequired();
+  AU.addRequired(); // NOTE(review): analysis type argument is missing as shown here -- confirm.
   AU.addRequired();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
@@ -371,6 +372,8 @@
   // Initialize the context of the pass.
MF = &mf; + MLI = &getAnalysis(); + AA = &getAnalysis().getAAResults(); PassConfig = &getAnalysis(); if (VerifyScheduling) @@ -498,6 +501,7 @@ } assert(RemainingInstrs == 0 && "Instruction count mismatch!"); Scheduler.finishBlock(); + // FIXME: Ideally, no further passes should rely on kill flags. However, // thumb2 size reduction is currently an exception, so the PostMIScheduler // needs to do this. @@ -739,6 +743,8 @@ } assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + SchedImpl->leaveRegion(); + placeDebugValues(); DEBUG({ @@ -2376,6 +2382,7 @@ case RegMax: return "REG-MAX "; case ResourceReduce: return "RES-REDUCE"; case ResourceDemand: return "RES-DEMAND"; + case FoldReload: return "FOLDRELOAD"; case TopDepthReduce: return "TOP-DEPTH "; case TopPathReduce: return "TOP-PATH "; case BotHeightReduce:return "BOT-HEIGHT"; @@ -2500,6 +2507,26 @@ return false; } +static bool tryFoldableReload(GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, + SchedBoundary &Zone, + const TargetInstrInfo *TII) { + bool CandReloadFoldable = TII->hasFoldableOperand(Cand.SU->getInstr()); + bool TryCandReloadFoldable = TII->hasFoldableOperand(TryCand.SU->getInstr()); + + if (Zone.isTop()) { + if (tryLess(TryCandReloadFoldable, CandReloadFoldable, + TryCand, Cand, GenericSchedulerBase::FoldReload)) + return true; + } + else { + if (tryGreater(TryCandReloadFoldable, CandReloadFoldable, + TryCand, Cand, GenericSchedulerBase::FoldReload)) + return true; + } + return false; +} + static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand, bool IsTop) { DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") @@ -2779,7 +2806,6 @@ TryCand, Cand, RegExcess, TRI, DAG->MF)) return; - // Avoid increasing the max critical pressure in the scheduled region. 
if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, @@ -2834,6 +2860,14 @@ TryCand, Cand, ResourceDemand)) return; + // Try to put lower in final schedule an instruction that doesn't + // mind if a source reg gets spilled, i.e. it can fold a reload of + // it. This makes the source operands more likely to be spilled, as + // opposed to the def operand. + if (RegionPolicy.FoldableReloadHeuristic && + tryFoldableReload(TryCand, Cand, Zone, DAG->TII)) + return; + // Avoid serializing long latency dependence chains. // For acyclic path limited loops, latency was already checked above. if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency && Index: lib/Target/SystemZ/CMakeLists.txt =================================================================== --- lib/Target/SystemZ/CMakeLists.txt +++ lib/Target/SystemZ/CMakeLists.txt @@ -17,12 +17,14 @@ SystemZConstantPoolValue.cpp SystemZElimCompare.cpp SystemZFrameLowering.cpp + SystemZHazardRecognizer.cpp SystemZISelDAGToDAG.cpp SystemZISelLowering.cpp SystemZInstrInfo.cpp SystemZLDCleanup.cpp SystemZLongBranch.cpp SystemZMachineFunctionInfo.cpp + SystemZMachineScheduler.cpp SystemZMCInstLower.cpp SystemZRegisterInfo.cpp SystemZSelectionDAGInfo.cpp Index: lib/Target/SystemZ/SystemZ.td =================================================================== --- lib/Target/SystemZ/SystemZ.td +++ lib/Target/SystemZ/SystemZ.td @@ -14,6 +14,11 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// +// SystemZ subtargets scheduling models. 
+//===----------------------------------------------------------------------===// +include "SystemZSchedule.td" + +//===----------------------------------------------------------------------===// // SystemZ supported processors and features //===----------------------------------------------------------------------===// Index: lib/Target/SystemZ/SystemZHazardRecognizer.h =================================================================== --- /dev/null +++ lib/Target/SystemZ/SystemZHazardRecognizer.h @@ -0,0 +1,161 @@ +//=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a hazard recognizer for scheduling of SystemZ +// functions. The main goal is to optimize decoder grouping. +// +// A decoder group can maximally hold 3 uops. Some instructions are +// expanded to 2 or more uops by the decoder, which means some groups +// will only contain 1 or 2 instructions. +// +// There are also instructions that have dual issue and execute on +// more than one execution unit, although the decoder only needs one +// slot for them. Currently, those extra execution units are however +// not considered. The uops modelled here represent one decoder slot +// and a usage of one processor resource. 
+// ===---------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H +#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H + +#include "SystemZSubtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm { + +/// SystemZDecodeGroupHardRecognizer reports a hazard for any +/// instruction that does not fit into current decoder group. +class SystemZDecodeGroupHazardRecognizer : public ScheduleHazardRecognizer { + const ScheduleDAG *DAG; + TargetSchedModel SchedModel; + + /// Keep track of the number of decoder slots used in current + /// decoder group. + unsigned CurGroupSize; + + /// Counters for the number of uops scheduled per processor + /// resource. + SmallVector ProcResourceCounters; + + /// Since the ProcResources are not currently enumerated, look this + /// index up and store it. + unsigned FPD_RESOURCE_IDX; + + /// Return MCSchedClassDesc for SU, or nullptr if not available. + inline const MCSchedClassDesc *getSchedClassDesc(const SUnit *SU) const { + const MCInstrDesc *MCIDesc = DAG->getInstrDesc(SU); + if (MCIDesc != nullptr) { + unsigned Idx = MCIDesc->getSchedClass(); + if (Idx) + return SchedModel.getMCSchedModel()->getSchedClassDesc(Idx); + } + return nullptr; + } + + /// Return number of uops as defined in .td file. + inline unsigned getNumMicroOps(const SUnit *SU) const { + const MCSchedClassDesc *SC = getSchedClassDesc(SU); + if (SC == nullptr) + return 1; + unsigned NumUOps = SC->NumMicroOps; + + // If instruction has more than three explicit source registers, + // it will limit the individual decode group to 2 uops. 
+ if (NumUOps == 1 && hasPlus3Sources(SU)) + return 2; + + return NumUOps; + } + + /// Return true if SU fits into current decoder group. + bool fitsIntoCurrentGroup(SUnit *SU) const; + + /// Initialize hazard recognizer before scheduling a region. + void init(); + + unsigned numGroupsPerCycle() { + return SchedModel.getMCSchedModel()->IssueWidth / 3; + } + + /// Two multicycle (div/sqrt) BFP operations should preferrably not + /// be issued to the same processor side, since that will incur a + /// stall (blocking execution unit). + /// Return true if SU is a BFP multicycle instruction. + bool isBFPMultiCycle(const SUnit *SU) const; + + /// True if current group contains a multi cycle op. + bool currGroupHasMultiCycleOp; + + /// Return true if the instruction has more than three sources, + /// which will limit the group to 2 uops instead of 3. + bool hasPlus3Sources(const SUnit *SU) const; + + /// (Experimental) Statistics counters + unsigned InsCount; + unsigned GrpCount; + unsigned MaxHeight; + unsigned SPAccesses; + unsigned MaxScheduledLatency; + unsigned QueuedUnits; + unsigned MaxQueued; + unsigned Noops; + unsigned Stalls; + unsigned Groupers; + + // Loop depth is considered for loop-weighted statistics. + unsigned LoopDepth; + unsigned getLoopWeight() { return (LoopDepth ? LoopDepth * 50 : 1); } + +public: + SystemZDecodeGroupHazardRecognizer(const ScheduleDAG *DAG_); + + HazardType getHazardType(SUnit *m, int Stalls = 0) override; + void Reset() override; + void EmitInstruction(SUnit *SU) override; + unsigned PreEmitNoops(SUnit *) override; + + /// Start next decoder group. + void nextGroup(); + + // Cost functions used by SystemZPostRASchedStrategy while + // evaluating candidates. 
+  bool newGroupAndSUMustBegin(const SUnit *SU) const;
+  bool mustEndSUWouldCompleteGroup(const SUnit *SU) const;
+  int groupingCost(const SUnit *SU) const;
+  bool multiCycleStallInGroup(const SUnit *SU) const;
+  unsigned resourcesCost(const SUnit *SU) const;
+
+#ifndef NDEBUG
+  // Debug dumping. These members exist only when NDEBUG is not defined.
+  std::string CurGroupDbg; // current group as text
+  void dumpSU(SUnit *SU, raw_ostream &OS) const;
+  void dumpCurrGroup(std::string Msg = "") const;
+  void dumpProcResourceCounters() const;
+#endif
+
+  /// Update statistics after scheduling.
+  void doStats();
+  void resourcesQueued(const SUnit *SU, unsigned &ResQueued,
+                       unsigned &MaxQ) const;
+
+  /// Set loop depth for loop weighted statistics.
+  void setLoopDepth(unsigned d) { LoopDepth = d; }
+};
+
+} // end namespace llvm
+
+#endif
Index: lib/Target/SystemZ/SystemZHazardRecognizer.cpp
===================================================================
--- /dev/null
+++ lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -0,0 +1,489 @@
+//=-- SystemZHazardRecognizer.cpp - SystemZ Hazard Recognizer ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZHazardRecognizer.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include // NOTE(review): header name is missing as shown here -- restore it.
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// Plenty of (experimental) statistics.
+
+STATISTIC(NumInstrs, "SystemZ: Number of instructions scheduled (all).");
+STATISTIC(NumDecoderGroups, "SystemZ: Number of decoder groups scheduled.");
+
+// Too much parallelism pre-ra will result in spilling
+STATISTIC(NumSPAccesses, "SystemZ: Number of SP accesses");
+
+// Too little parallelism pre-ra will result in unnecessarily
+// dependent instructions, if the reg-alloc reuses registers.
That
+// will make the post-ra DAGs higher.
+STATISTIC(NumDAGHeights, "SystemZ: Total height of all DAGs (post RA)");
+
+// The post-ra measure of the static latency of the scheduled regions.
+// The more priority the scheduler puts on latency, the smaller this will be.
+STATISTIC(RegionLatency, "SystemZ: The scheduled latency for regions.");
+
+// How many instructions did the post-ra scheduler think it scheduled
+// when exec units were not believed to be available?
+STATISTIC(ExecUnitsQueues, "SystemZ: FUs queued");
+
+// What did the post-ra scheduler think was the longest exec unit
+// queue for any scheduled instruction?
+STATISTIC(MaxExecUnitQueue, "SystemZ: Sum of longest FU queues in regions");
+
+STATISTIC(NumNoops, "SystemZ: Number of noops inserted");
+STATISTIC(NumStalls, "SystemZ: Number of stall cycles (FPd unit)");
+
+STATISTIC(NumGroupers, "SystemZ: Number of grouping instructions");
+
+// Loop weighted versions. A loop is weighted to (50 * loop-depth).
+STATISTIC(NumDecoderGroupsWeighted,
+          "SystemZ: Number of decoder groups scheduled, Loop Weighted.");
+STATISTIC(NumDAGHeightsWeighted,
+          "SystemZ: Total height of all DAGs (post RA), Loop Weighted.");
+STATISTIC(RegionLatencyWeighted,
+          "SystemZ: The scheduled latency for regions, Loop Weighted.");
+STATISTIC(ExecUnitsQueuesWeighted,
+          "SystemZ: FUs queued, Loop Weighted.");
+STATISTIC(MaxExecUnitQueueWeighted,
+          "SystemZ: Sum of longest FU queues in regions, Loop Weighted.");
+STATISTIC(NumStallsWeighted,
+          "SystemZ: Number of stall cycles (FPd unit), Loop Weighted.");
+STATISTIC(NumGroupersWeighted,
+          "SystemZ: Number of grouping instructions, Loop Weighted.");
+
+
+// XXJ option which turns off the hazard recognizer, to check the
+// value of it. Defaults to on.
+static cl::opt<bool> Active("decgroups", cl::Hidden,
+                            cl::init(true));
+
+// Experimental option: Insert nops to not put two multi cycle ops in
+// the same decoder group.
+static cl::opt<bool> FPUdNops("fpudnops", cl::Hidden,
+  cl::desc("SystemZ: Insert nops to separate multi cycle ops."),
+  cl::init(true));
+
+SystemZDecodeGroupHazardRecognizer::
+SystemZDecodeGroupHazardRecognizer(const ScheduleDAG *DAG_) : DAG(DAG_) {
+  const SystemZSubtarget &ST =
+    static_cast<const SystemZSubtarget &>(DAG->MF.getSubtarget());
+  SchedModel.init(ST.getSchedModel(), &ST, ST.getInstrInfo());
+
+  // Set to 1 to indicate 'enabled'.
+  MaxLookAhead = 1;
+
+  // Find out what is the index for the FPd unit if it is part of the
+  // sched model (enum values are currently not available). It is taken to be
+  // the single resource with BufferSize == 0; index 0 means "none". FIXME:
+  // This would be better handled by generally modelling stalling units.
+  FPD_RESOURCE_IDX = 0; // InvalidUnit
+  for (unsigned PIdx = 1, PEnd = SchedModel.getNumProcResourceKinds();
+       PIdx < PEnd; ++PIdx) // '<' so that a count of 0 ends the loop.
+    if (SchedModel.getProcResource(PIdx)->BufferSize == 0) {
+      assert (FPD_RESOURCE_IDX == 0 && "Cannot assume this FPd unit?");
+      FPD_RESOURCE_IDX = PIdx;
+    }
+
+  init();
+}
+
+bool SystemZDecodeGroupHazardRecognizer::
+isBFPMultiCycle(const SUnit *SU) const {
+  // If FPd unit is not present, return false.
+  if (FPD_RESOURCE_IDX == 0)
+    return false;
+
+  const MCSchedClassDesc *SC = getSchedClassDesc(SU);
+  if (SC != nullptr) {
+    for (TargetSchedModel::ProcResIter
+           PI = SchedModel.getWriteProcResBegin(SC),
+           PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI)
+      if (PI->ProcResourceIdx == FPD_RESOURCE_IDX)
+        return true;
+  }
+  return false;
+}
+
+
+// Return true if the instruction has more than three sources, which
+// will limit the group to 2 uops instead of 3.
+bool SystemZDecodeGroupHazardRecognizer:: +hasPlus3Sources(const SUnit *SU) const { + unsigned NumSources = 0; + for (auto &MO : SU->getInstr()->operands()) + if (MO.isReg() && MO.isUse() && !MO.isImplicit()) + ++NumSources; + return (NumSources > 3); +} + +ScheduleHazardRecognizer::HazardType SystemZDecodeGroupHazardRecognizer:: +getHazardType(SUnit *m, int Stalls) { + if (!Active) // XXJ + return NoHazard; + + return (fitsIntoCurrentGroup(m) ? NoHazard : Hazard); +} + +void SystemZDecodeGroupHazardRecognizer::Reset() { + init(); +} + +void SystemZDecodeGroupHazardRecognizer::init() { + CurGroupSize = 0; + + currGroupHasMultiCycleOp = false; + + InsCount = 0; + GrpCount = 0; + MaxHeight = 0; + SPAccesses = 0; + MaxScheduledLatency = 0; + QueuedUnits = 0; + MaxQueued = 0; + Noops = 0; + Stalls = 0; + Groupers = 0; + + LoopDepth = 0; + + ProcResourceCounters.resize(SchedModel.getNumProcResourceKinds()); + for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i) + ProcResourceCounters[i] = 0; + + DEBUG(CurGroupDbg = "";); +} + +bool +SystemZDecodeGroupHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { + if (CurGroupSize == 0) + return true; + + // If instruction must begin group, it cannot be added to current + // group (empty group checked for above). + const MCSchedClassDesc *SC = getSchedClassDesc(SU); + if (SC != nullptr && SC->BeginGroup) + return false; + + unsigned NumUOps = getNumMicroOps(SU); + + // If SU has a scheduling class, subtarget must fill in the proper + // values. At least one micro op is expected. + assert ( NumUOps > 0 && + "Missing subtarget scheduler input for SU?"); + + // Any instruction using 2 or more uops also begins a new group, + // which was handled above. + assert (NumUOps == 1 && + "Instruction with multiple uops does not begin group?"); + + // Since a full group is handled immediately in EmitInstruction(), + // SU should fit into current group. 
+ assert ((CurGroupSize + NumUOps <= 3) && + "Expected non-full group!"); + + return true; +} + +// Start next decoder group. +void SystemZDecodeGroupHazardRecognizer::nextGroup() { + if (CurGroupSize > 0) { + DEBUG(dumpCurrGroup("Completed decode group")); + DEBUG(CurGroupDbg = "";); + + // Reset current group + CurGroupSize = 0; + currGroupHasMultiCycleOp = false; + + GrpCount++; + + // Decrease counters for execution units by one. + for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i) + if (ProcResourceCounters[i] > 0) + ProcResourceCounters[i]--; + } + + DEBUG(dumpProcResourceCounters();); +} + +#ifndef NDEBUG +// Debug output +void SystemZDecodeGroupHazardRecognizer:: +dumpSU(SUnit *SU, raw_ostream &OS) const{ + OS << "SU(" << SU->NodeNum << ")"; + OS << ":" << DAG->TII->getName(SU->getInstr()->getOpcode()); + const MCSchedClassDesc *SC = getSchedClassDesc(SU); + if (SC != nullptr) { + for (TargetSchedModel::ProcResIter + PI = SchedModel.getWriteProcResBegin(SC), + PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) { + const MCProcResourceDesc &PRD = + *SchedModel.getProcResource(PI->ProcResourceIdx); + std::string U(PRD.Name); + + // trim e.g. 
Z13_FXUnit -> FXU + if (U.find("FXU") != std::string::npos) + OS << "/FXU"; + else if (U.find("VFU") != std::string::npos) + OS << "/VFU"; + else if (U.find("FPU") != std::string::npos) + OS << "/FPU"; + else if (U.find("FPd") != std::string::npos) + OS << "/FPd"; + else if (U.find("LSU") != std::string::npos) + OS << "/LSU"; + else if (U.find("VBU") != std::string::npos) + OS << "/VBU"; + + if (PI->Cycles > 1) + OS << "(" << PI->Cycles << "cyc)"; + } + + if (SC->NumMicroOps > 1) + OS << "/" << SC->NumMicroOps << "uops"; + if (SC->BeginGroup && SC->EndGroup) + OS << "/GroupsAlone"; + else if (SC->BeginGroup) + OS << "/BeginsGroup"; + else if (SC->EndGroup) + OS << "/EndsGroup"; + } +} + +void SystemZDecodeGroupHazardRecognizer:: +dumpCurrGroup(std::string Msg) const { + dbgs() << "+++ " << Msg << ": "; + if (CurGroupDbg.empty()) + dbgs() << " \n"; + else { + dbgs() << "{ " << CurGroupDbg << " }"; + dbgs() << " (" << CurGroupSize << (CurGroupSize > 1 ? "uops":"uop") + << ")\n"; + } +} + +void SystemZDecodeGroupHazardRecognizer::dumpProcResourceCounters() const { + for (unsigned i = 0; i < SchedModel.getNumProcResourceKinds(); ++i) + if (ProcResourceCounters[i] > 0) { + dbgs() << "+++ Extra schedule for execution unit " + << SchedModel.getProcResource(i)->Name + << ": " << ProcResourceCounters[i] << "\n"; + } +} +#endif + +// Update statistics after scheduling. +void SystemZDecodeGroupHazardRecognizer::doStats() { + if (InsCount <= 1) + return; + + if (CurGroupSize) + nextGroup(); + + // Print average number of instructions per decoder group for region. 
+ DEBUG(char Tmp[16]; + float Ratio = ((float) InsCount / GrpCount); + sprintf(Tmp, "%.3f", Ratio); + dbgs() << "+++ stats: " << InsCount + << " instructions, " << GrpCount << " decoder group(s), Ratio: " + << Tmp << ", DAG height: " << MaxHeight << ", SP: " + << SPAccesses << "\n";); + + NumInstrs += InsCount; + + NumDecoderGroups += GrpCount; + NumDecoderGroupsWeighted += GrpCount * getLoopWeight(); + + NumSPAccesses += SPAccesses; + + NumDAGHeights += MaxHeight; + NumDAGHeightsWeighted += MaxHeight * getLoopWeight(); + + RegionLatency += MaxScheduledLatency; + RegionLatencyWeighted += MaxScheduledLatency * getLoopWeight(); + + ExecUnitsQueues += QueuedUnits; + ExecUnitsQueuesWeighted += QueuedUnits * getLoopWeight(); + + MaxExecUnitQueue += MaxQueued; + MaxExecUnitQueueWeighted += MaxQueued * getLoopWeight(); + + NumNoops += Noops; + + NumStalls += Stalls; + NumStallsWeighted += Stalls * getLoopWeight(); + + NumGroupers += Groupers; + NumGroupersWeighted += Groupers * getLoopWeight(); +} + +// Update state by taking SU as next instruction. 
+void SystemZDecodeGroupHazardRecognizer::EmitInstruction(SUnit *SU) { + const MCSchedClassDesc *SC = getSchedClassDesc(SU); + assert (fitsIntoCurrentGroup(SU) && "Emitted SU does not fit in group?"); + + DEBUG(dumpCurrGroup("Decode group before emission");); + DEBUG(dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs()); + dbgs() << "\n"; + raw_string_ostream cgd(CurGroupDbg); + if (CurGroupDbg.length()) + cgd << ", "; + dumpSU(SU, cgd)); + + // Do statistics first + InsCount++; + if (SU->getHeight() > MaxHeight) + MaxHeight = SU->getHeight(); + const MachineInstr *MI = SU->getInstr(); + if ((MI->mayLoad() || MI->mayStore()) && + MI->readsRegister(SystemZ::R15D)) + SPAccesses++; + + unsigned CurrDecoderCycle = GrpCount / numGroupsPerCycle(); + unsigned LatencyToEnd = CurrDecoderCycle + SU->getHeight(); + if (LatencyToEnd > MaxScheduledLatency) + MaxScheduledLatency = LatencyToEnd; + + unsigned ResQueued, MaxQ; + resourcesQueued(SU, ResQueued, MaxQ); + QueuedUnits += ResQueued; + if (MaxQ > MaxQueued) + MaxQueued = MaxQ; + + if (SC != nullptr && (SC->BeginGroup || (SC->EndGroup))) + Groupers++; + + // Keep track of number of uops in current group. + CurGroupSize += getNumMicroOps(SU); + + // Make note of a scheduled blocking multi cycle op. + if (isBFPMultiCycle(SU)) { + if (currGroupHasMultiCycleOp) + Stalls += 8; + currGroupHasMultiCycleOp = true; + } + + // Increase counter for execution unit(s). + if (SC != nullptr) { + for (TargetSchedModel::ProcResIter + PI = SchedModel.getWriteProcResBegin(SC), + PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) + ProcResourceCounters[PI->ProcResourceIdx] += PI->Cycles; + } + + // Check if current group is now full. Ops with more than 3 uops or + // ops that have a dynamic number of uops (such as Load Multiple), + // are not handled any further than ending the current group. 
+ if (CurGroupSize >= 3 || (SC != nullptr && SC->EndGroup)) + nextGroup(); +} + +// Emit nop(s) to put FPd (blocking multicycle) SU into the next +// decoder group, if it is the second one into the group. The nop +// insertion is done by the sched strategy immediately upon returning +// from here, then EmitInstruction() is called. +unsigned SystemZDecodeGroupHazardRecognizer::PreEmitNoops(SUnit *SU) { + if (FPUdNops && + currGroupHasMultiCycleOp && + isBFPMultiCycle(SU)) { + unsigned Num = (3 - CurGroupSize); + nextGroup(); + Noops += Num; + return Num; + } + return 0; +} + +bool SystemZDecodeGroupHazardRecognizer:: +newGroupAndSUMustBegin(const SUnit *SU) const { + if (CurGroupSize) + return false; + const MCSchedClassDesc *SC = getSchedClassDesc(SU); + return (SC != nullptr && SC->BeginGroup); +} + +bool SystemZDecodeGroupHazardRecognizer:: +mustEndSUWouldCompleteGroup(const SUnit *SU) const { + if (!CurGroupSize) + return false; + const MCSchedClassDesc *SC = getSchedClassDesc(SU); + return (SC != nullptr && !SC->BeginGroup && SC->EndGroup && + (getNumMicroOps(SU) == 3 - CurGroupSize)); +} + +int SystemZDecodeGroupHazardRecognizer:: +groupingCost(const SUnit *SU) const { + if (newGroupAndSUMustBegin(SU) || mustEndSUWouldCompleteGroup(SU)) + return -1; + + const MCSchedClassDesc *SC = getSchedClassDesc(SU); + if (SC == nullptr) + return 0; + + int cost = 0; + if (SC->BeginGroup && CurGroupSize) + cost = (3 - CurGroupSize); + + if (SC->EndGroup) { + unsigned resultingGroupSize = (CurGroupSize + getNumMicroOps(SU)); + if (resultingGroupSize < 3) + cost = 3 - resultingGroupSize; + } + + return cost; +} + +// Return true if scheduling this SU would mean putting a second multi +// cycle stalling instruction into current group. +bool SystemZDecodeGroupHazardRecognizer:: +multiCycleStallInGroup(const SUnit *SU) const { + return (currGroupHasMultiCycleOp && isBFPMultiCycle(SU)); +} + +// Return the number of queued cycles per processor resource before +// scheduling SU. 
+unsigned SystemZDecodeGroupHazardRecognizer::
+resourcesCost(const SUnit *SU) const {
+  const MCSchedClassDesc *SC = getSchedClassDesc(SU);
+  unsigned cost = 0;
+  if (SC != nullptr) {
+    for (TargetSchedModel::ProcResIter
+           PI = SchedModel.getWriteProcResBegin(SC),
+           PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI)
+      cost += ProcResourceCounters[PI->ProcResourceIdx];
+  }
+  return cost;
+}
+
+// Before scheduling SU: ResQueued receives the sum of the queued cycles over
+// all resources SU uses, and MaxQ the longest queue on any single resource.
+void SystemZDecodeGroupHazardRecognizer::
+resourcesQueued(const SUnit *SU, unsigned &ResQueued, unsigned &MaxQ) const {
+  const MCSchedClassDesc *SC = getSchedClassDesc(SU);
+  ResQueued = 0;
+  MaxQ = 0;
+  if (SC != nullptr) {
+    for (TargetSchedModel::ProcResIter
+           PI = SchedModel.getWriteProcResBegin(SC),
+           PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
+      unsigned PIdx = PI->ProcResourceIdx;
+      if (ProcResourceCounters[PIdx] + PI->Cycles > 1) {
+        unsigned q = ProcResourceCounters[PIdx] + PI->Cycles - 1;
+        ResQueued += q;
+        if (q > MaxQ) // Compare the per-resource queue, not the running sum.
+          MaxQ = q;
+      }
+    }
+  }
+}
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -426,6 +426,9 @@
   MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
                                                  MachineBasicBlock *BB) const
     override;
+
+  const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
+
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   bool allowTruncateForTailCall(Type *, Type *) const override;
   bool mayBeEmittedAsTailCall(CallInst *CI) const override;
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -27,6 +27,10 @@
 #define DEBUG_TYPE "systemz-lower"
+cl::opt<std::string> SchedPref("schedpref",
cl::desc("Experimental: SystemZ SchedPref"),
+                               cl::init("source"), cl::Hidden);
+
 namespace {
 // Represents a sequence for extracting a 0/1 value from an IPM result:
 //    (((X ^ XORValue) + AddValue) >> Bit)
@@ -116,11 +120,21 @@
   // Set up special registers.
   setStackPointerRegisterToSaveRestore(SystemZ::R15D);
-  // TODO: It may be better to default to latency-oriented scheduling, however
-  // LLVM's current latency-oriented scheduler can't handle physreg definitions
-  // such as SystemZ has with CC, so set this to the register-pressure
-  // scheduler, because it can.
-  setSchedulingPreference(Sched::RegPressure);
+  // XXJ Experimental: z10 keeps RegPressure; others follow -schedpref.
+  if (Subtarget.isZ10())
+    setSchedulingPreference(Sched::RegPressure);
+  else {
+    if (SchedPref == "source")
+      setSchedulingPreference(Sched::Source);
+    else if (SchedPref == "hybrid")
+      setSchedulingPreference(Sched::Hybrid);
+    else if (SchedPref == "ilp")
+      setSchedulingPreference(Sched::ILP);
+    else if (SchedPref == "regpress")
+      setSchedulingPreference(Sched::RegPressure);
+    else // User-supplied value: diagnose it; this branch is reachable.
+      report_fatal_error("bad schedpref string");
+  }
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
@@ -5927,3 +5941,14 @@
     llvm_unreachable("Unexpected instr type to insert");
   }
 }
+
+const TargetRegisterClass *SystemZTargetLowering::
+getRepRegClassFor(MVT VT) const {
+  // This can unfortunately not distinguish between integer /
+  // vector registers. Do they both need to be 'untyped'?
+  // (This is called if ilp-list scheduler is used.)
+ if (VT == MVT::Untyped) + return &SystemZ::GR128BitRegClass; + + return TargetLowering::getRepRegClassFor(VT); +} Index: lib/Target/SystemZ/SystemZInstrInfo.h =================================================================== --- lib/Target/SystemZ/SystemZInstrInfo.h +++ lib/Target/SystemZ/SystemZInstrInfo.h @@ -17,6 +17,7 @@ #include "SystemZ.h" #include "SystemZRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" #define GET_INSTRINFO_HEADER #include "SystemZGenInstrInfo.inc" @@ -117,6 +118,7 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { const SystemZRegisterInfo RI; SystemZSubtarget &STI; + TargetSchedModel SchedModel; void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const; void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const; @@ -151,6 +153,13 @@ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, DebugLoc DL) const override; + ScheduleHazardRecognizer* + CreateTargetMIHazardRecognizer(const InstrItineraryData*, + const ScheduleDAG *DAG) const override; + + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const override; bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, @@ -192,6 +201,8 @@ MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const override; bool expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const override; + bool hasFoldableOperand(const MachineInstr *MI, unsigned reg = 0) const + override; bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; @@ -240,6 +251,16 @@ void loadImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned Reg, uint64_t Value) const; + + // Sometimes, it is possible for the target to tell, even without + // aliasing information, that two MIs access different memory + // 
addresses. This function returns true if two MIs access different + // memory addresses and false otherwise. + bool + areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; + + int getMemOpcode(unsigned opc) const; }; } // end namespace llvm Index: lib/Target/SystemZ/SystemZInstrInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZInstrInfo.cpp +++ lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -14,8 +14,11 @@ #include "SystemZInstrInfo.h" #include "SystemZInstrBuilder.h" #include "SystemZTargetMachine.h" +#include "SystemZHazardRecognizer.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" using namespace llvm; @@ -23,6 +26,8 @@ #define GET_INSTRMAP_INFO #include "SystemZGenInstrInfo.inc" +#define DEBUG_TYPE "systemz-instr-info" + // Return a mask with Count low bits set. static uint64_t allOnes(unsigned int Count) { return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1; @@ -43,6 +48,8 @@ SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti) : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP), RI(), STI(sti) { + + SchedModel.init(STI.getSchedModel(), &STI, STI.getInstrInfo()); } // MI is a 128-bit load or store. Split it into two 64-bit loads or stores, @@ -359,6 +366,27 @@ } bool SystemZInstrInfo:: +hasFoldableOperand(const MachineInstr *MI, unsigned reg) const { + if (SystemZ::getMemOpcode(MI->getOpcode()) == -1) + return false; + + // If MI is mapped to a memory-opcode, it can fold one of its + // operands in case that operand register gets spilled. If reg is 0, + // we don't know which operand might be spilled, but the mischeduler + // TryCandidate() can help things a bit generally by putting MI a + // bit lower in final schedule. 
+ if (!reg) + return true; + + // If reg is given, check if that operand could be folded if placed + // on stack. CalcSpillWeights will in this case decrease the cost + // estimate for spilling the register. + unsigned NumOps = MI->getNumExplicitOperands(); + const MachineOperand &MO = MI->getOperand(NumOps - 1); + return (MO.isReg() && MO.getReg() == reg); +} + +bool SystemZInstrInfo:: ReverseBranchCondition(SmallVectorImpl &Cond) const { assert(Cond.size() == 2 && "Invalid condition"); Cond[1].setImm(Cond[1].getImm() ^ Cond[0].getImm()); @@ -402,6 +430,24 @@ return Count; } +ScheduleHazardRecognizer* SystemZInstrInfo:: +CreateTargetMIHazardRecognizer(const InstrItineraryData*, + const ScheduleDAG *DAG) const { + bool isPreRA = DAG->MRI.getNumVirtRegs(); + + if (!isPreRA && SchedModel.hasInstrSchedModel()) + return new SystemZDecodeGroupHazardRecognizer(DAG); + + // Dummy hazard recognizer allows all instructions to issue. + return new ScheduleHazardRecognizer(); +} + +void SystemZInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + BuildMI(MBB, MI, DebugLoc(), get(SystemZ::LR), SystemZ::R0L) + .addReg(SystemZ::R0L); +} + bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const { @@ -1292,3 +1338,46 @@ } BuildMI(MBB, MBBI, DL, get(Opcode), Reg).addImm(Value); } + +bool SystemZInstrInfo:: +areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA) const { + + if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) + return false; + + // If mem-operands show that the same address Value is used by both + // instructions, check for non-overlapping offsets and widths. Not + // sure if a register based analysis would be an improvement... 
+ + MachineMemOperand *MMOa = *MIa->memoperands_begin(); + MachineMemOperand *MMOb = *MIb->memoperands_begin(); + const Value *VALa = MMOa->getValue(); + const Value *VALb = MMOb->getValue(); + bool SameVal = (VALa && VALb && (VALa == VALb)); + if (!SameVal) { + const PseudoSourceValue *PSVa = MMOa->getPseudoValue(); + const PseudoSourceValue *PSVb = MMOb->getPseudoValue(); + if (PSVa && PSVb && (PSVa == PSVb)) + SameVal = true; + } + if (SameVal) { + int OffsetA = MMOa->getOffset(), OffsetB = MMOb->getOffset(); + int WidthA = MMOa->getSize(), WidthB = MMOb->getSize(); + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; + if (LowOffset + LowWidth <= HighOffset) + return true; + } + + return false; +} + +// A wrapper around the generated function, since it can't be included +// and built twice. XXJ Remove? +int SystemZInstrInfo:: +getMemOpcode(unsigned opc) const { + return SystemZ::getMemOpcode(opc); +} + Index: lib/Target/SystemZ/SystemZInstrInfo.td =================================================================== --- lib/Target/SystemZ/SystemZInstrInfo.td +++ lib/Target/SystemZ/SystemZInstrInfo.td @@ -1222,7 +1222,7 @@ // A serialization instruction that acts as a barrier for all memory // accesses, which expands to "bcr 14, 0". 
let hasSideEffects = 1 in -def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; + def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { def LAA : LoadAndOpRSY<"laa", 0xEBF8, atomic_load_add_32, GR32>; Index: lib/Target/SystemZ/SystemZMachineScheduler.h =================================================================== --- /dev/null +++ lib/Target/SystemZ/SystemZMachineScheduler.h @@ -0,0 +1,116 @@ +//==-- SystemZMachineScheduler.h - SystemZ Scheduler Interface -*- C++ -*---==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// SystemZ Machine Scheduler interface +// This scheduler is run just before register allocation. +// +//===----------------------------------------------------------------------===// + +#include "SystemZInstrInfo.h" +#include "SystemZHazardRecognizer.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/Support/Debug.h" + +#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H +#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H + +using namespace llvm; + +namespace llvm { + +class SystemZPostRASchedStrategy : public MachineSchedStrategy { + + struct Candidate { + SUnit *SU; + + // GroupingCost is negative if it would be a win to schedule this + // SU next, or positive if it would break the group early. + int GroupingCost; + + bool MultiCycleStall; + unsigned ResourcesCost; + Candidate() : SU(nullptr), GroupingCost(0), MultiCycleStall(false), + ResourcesCost(0) {} + Candidate(SUnit *SU_, SystemZDecodeGroupHazardRecognizer *HazRec, + bool NegGroupingCost); + + // Compare two candidates. + bool operator<(const Candidate &other); + + // Check if this node is as good as it could be. 
+ bool noCost() { + return (GroupingCost <= 0 && !MultiCycleStall && !ResourcesCost); + } + }; + + // Keep all available SUs in a set sorted by their heights. + struct SUSorter { + bool operator() (const SUnit *lhs, const SUnit *rhs) const { + if (lhs->getHeight() > rhs->getHeight()) + return true; + else if (lhs->getHeight() < rhs->getHeight()) + return false; + return (lhs->NodeNum < rhs->NodeNum); + } + }; + struct SUSet : std::set { + #ifndef NDEBUG + void dump(SystemZDecodeGroupHazardRecognizer *HazRec); + #endif + }; + + ScheduleDAGMI *DAG; + + const MachineLoopInfo *MLI; + // Loop of current region, or nullptr + const MachineLoop *Loop; + + // All available nodes, sorted by height. + SUSet Available; + + // HazardRecognizer that tracks decoder groups. + SystemZDecodeGroupHazardRecognizer *HazRec; + public: + SystemZPostRASchedStrategy(const MachineSchedContext *C) : + DAG(nullptr), MLI(C->MLI), Loop(nullptr), HazRec(nullptr) {} + virtual ~SystemZPostRASchedStrategy() { delete HazRec; } + + + /// Called before each region + void initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) override; + + void leaveRegion() override { HazRec->doStats(); } + + /// PostRA scheduling does not track pressure. + bool shouldTrackPressure() const override { return false; } + + /// Initialize the strategy after building the DAG for a new region. + void initialize(ScheduleDAGMI *dag) override; + + /// Pick the next node to schedule, or return NULL. + SUnit *pickNode(bool &IsTopNode) override; + + /// ScheduleDAGMI has scheduled an instruction - tell HazardRec + /// about it. + void schedNode(SUnit *SU, bool IsTopNode) override; + + /// SU has had all predecessor dependencies resolved. Put it into + /// Available. + void releaseTopNode(SUnit *SU) override; + + /// Currently only scheduling top-down, so this method is empty. 
+ void releaseBottomNode(SUnit *SU) override {}; +}; + +} // namespace llvm + +#endif /* LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H */ Index: lib/Target/SystemZ/SystemZMachineScheduler.cpp =================================================================== --- /dev/null +++ lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -0,0 +1,288 @@ +//==-- SystemZMachineScheduler.cpp - SystemZ Scheduler Interface -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// SystemZ Machine Scheduler interface +// This scheduler is run just before register allocation. +// +//===----------------------------------------------------------------------===// + +#include "SystemZMachineScheduler.h" +#include "SystemZSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +// XXJ option which reverses heuristics by reversing the candidate +// evaluation score. +static cl::opt DoMyWorst("domyworst", cl::Hidden, + cl::init(false)); + +// XXJ option to make scheduler output same order, to get hazard +// recognizer statistics without scheduling. +static cl::opt SchedNoChange("schednochange", cl::Hidden, + cl::init(false)); + +// XXJ option to make scheduler output a random ("unscheduled") order. 
+#include /* srand, rand */ +#include /* time */ +static cl::opt SchedRandom("schedrandom", cl::Hidden, + cl::init(false)); + +// XXJ option to make scheduler just schedule per height +static cl::opt SchedByHeight("schedheight", cl::Hidden, + cl::init(false)); + +static cl::opt DisableResources("sched-noresources", cl::Hidden, + cl::init(false)); + + +// ------------------------ Post RA scheduling ---------------------------- // +// SystemZPostRASchedStrategy is a scheduling strategy which is +// plugged into the MachineScheduler. It has an Available set of SUs +// sorted by height, and a pickNode() implementation that schedules by +// height while also filling decoder groups and balancing the use of +// resources. + +#ifndef NDEBUG +// Print the set of SUs +void SystemZPostRASchedStrategy::SUSet:: +dump(SystemZDecodeGroupHazardRecognizer *HazRec) { + dbgs() << "{"; + for (auto &SU : *this) { + HazRec->dumpSU(SU, dbgs()); + if (SU != *rbegin()) + dbgs() << ", "; + } + dbgs() << "}\n"; +} +#endif + +void SystemZPostRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) { + const MachineBasicBlock *MBB = Begin->getParent(); + Loop = MLI->getLoopFor(MBB); +} + +void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) { + DAG = dag; + if (HazRec == nullptr) + HazRec = new SystemZDecodeGroupHazardRecognizer(DAG); + else + HazRec->Reset(); + + HazRec->setLoopDepth(Loop ? Loop->getLoopDepth() : 0); +} + +// Pick the next node to schedule. +SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) { + // Only scheduling top-down. + IsTopNode = true; + + // All nodes that are possible to schedule are stored by height in + // the Available set. This includes any node with all predecessors + // scheduled. + DEBUG(dbgs() << "Available: "; Available.dump(HazRec);); + if (Available.empty()) + return nullptr; + + SUnit *Next = nullptr; + + // If only one choice, return it. 
+ if (Available.size() == 1) { + Next = *Available.begin(); + Available.erase(Next); + if (HazRec->getHazardType(Next) != ScheduleHazardRecognizer::NoHazard) + HazRec->nextGroup(); + return Next; + } + + // Experimental: Output a random order schedule, meaning "no + // scheduling" + if (SchedRandom) { + srand (time(NULL)); + int num = rand() % Available.size(); + SUSet::const_iterator it(Available.begin()); + advance(it, num); + Next = *it; + Available.erase(Next); + if (HazRec->getHazardType(Next) != ScheduleHazardRecognizer::NoHazard) + HazRec->nextGroup(); + return Next; + } + + // Experimental: Output an unchanged order of instructions, in order + // to get statistics for it. + if (SchedNoChange) { + SUnit *NextNodeNum = nullptr; + for (auto *SU : Available) + if (NextNodeNum == nullptr || + SU->NodeNum < NextNodeNum->NodeNum) + NextNodeNum = SU; + Available.erase(NextNodeNum); + if (HazRec->getHazardType(NextNodeNum) != ScheduleHazardRecognizer::NoHazard) + HazRec->nextGroup(); + return NextNodeNum; + } + + Candidate Best; + unsigned NumChecked = 0; + SUnit *HighestSU = *Available.begin(); + for (auto *SU : Available) { + // XXX Schedule high stalling SUs : SU->hasReservedResource + + // Check with HazRec if this SU fits into current decoder group. + if (HazRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) + continue; + + // Experimental: If scheduling by height only, return first SU + // that fits into current decoder group. + if (SchedByHeight) { + Next = SU; + break; + } + + // SU is the next candidate, with cost values that are compared to + // other SUs. Always check five highest SUs in case there is an SU + // that must begin / end a group that would be preferable to + // schedule now. + Candidate c(SU, HazRec, (NumChecked++ < 5) /* check begin/end group */); + if (c.GroupingCost < 0) { + Next = SU; + break; + } + + // Remember which SU is the best candidate. 
+ if (Best.SU == nullptr || c < Best) { + Best = c; + DEBUG(dbgs() << "Best sofar: "; + HazRec->dumpSU(Best.SU, dbgs()); + dbgs() << "\tGrouping cost:" << c.GroupingCost; + if (Best.MultiCycleStall) + dbgs() << " "; + else + dbgs() << " "; + dbgs() << " Resource cost:" << Best.ResourcesCost + << " Height:" << Best.SU->getHeight() << "\n";); + } + + // If more than five SUs have been checked, there was no SU that + // must begin or end current decoder group. + if (NumChecked > 5) { + // If there is an SU which has no cost, return it. + if (Best.noCost()) + break; + // If all SUs that are of about the same height have been + // checked, return the best one. + if ((HighestSU->getHeight() - SU->getHeight()) > 1) + break; + } + } + if (Next == nullptr && Best.SU != nullptr) + Next = Best.SU; + + // HazRec has rejected them all. Start a new decoder group and try + // again. + if (Next == nullptr) { + HazRec->nextGroup(); + return pickNode(IsTopNode); + } + + assert (Next != nullptr && "SU lost?"); + Available.erase(Next); + return Next; +} + +SystemZPostRASchedStrategy::Candidate:: +Candidate(SUnit *SU_, SystemZDecodeGroupHazardRecognizer *HazRec, + bool NegGroupingCost) { + SU = SU_; + + // Check the grouping cost. For a node that must begin / end a + // group, it is positive if it would do so prematurely, or negative + // if it would fit naturally into the schedule. + GroupingCost = HazRec->groupingCost(SU); + // Only look for naturally fitting SUs within a certain + // "look-ahead". After that, it is known that they get a new chance + // since the current decoder group will be completed. + if (GroupingCost < 0 && !NegGroupingCost) + GroupingCost = 0; + + // Check if this would be a second multi cycle into current group. 
+ MultiCycleStall = HazRec->multiCycleStallInGroup(SU); + + // Check the resources cost for this SU + if (!DisableResources) + ResourcesCost = HazRec->resourcesCost(SU); +} + +bool SystemZPostRASchedStrategy::Candidate:: +operator<(const Candidate &other) { + bool IsBetter = true; + + // Check first for decoder grouping + if (GroupingCost < other.GroupingCost) + IsBetter = true; + else if (GroupingCost > other.GroupingCost) + IsBetter = false; + + // Avoid two multicycle ops in same group + else if (!MultiCycleStall && other.MultiCycleStall) + IsBetter = true; + else if (MultiCycleStall && !other.MultiCycleStall) + IsBetter = false; + + // Compare the use of resources + else if (ResourcesCost < other.ResourcesCost) + IsBetter = true; + else if (ResourcesCost > other.ResourcesCost) + IsBetter = false; + + // Higher SU is generally better + else if (SU->getHeight() > other.SU->getHeight()) + IsBetter = true; + else if (SU->getHeight() < other.SU->getHeight()) + IsBetter = false; + + // If all same, keep original order. + else if (SU->NodeNum < other.SU->NodeNum) + IsBetter = true; + else + IsBetter = false; + + // Experimental: DoMyWorst reverses the cost function. + return (!DoMyWorst ? IsBetter : !IsBetter); +} + +void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + DEBUG(dbgs() << "** Scheduling "; + SU->dump(DAG)); + + // Emit nop(s) to put FPd (blocking multicycle) SU into the next + // decoder group. Scheduler has already moved SU. + if (unsigned NumNops = HazRec->PreEmitNoops(SU)) { + while (NumNops--) { + DAG->TII->insertNoop(*SU->getInstr()->getParent(), SU->getInstr()); + DEBUG(dbgs() << "** Inserting NOOP\n"); + } + } + + HazRec->EmitInstruction(SU); +} + +// Put all released SUs in the Available set. There is no Pending set +// (for nodes which are not ready on the current cycle), since we are +// primarily filling decoder groups, and will put an instruction into +// an available decoder slot, even if it was not ready on that cycle. 
+void SystemZPostRASchedStrategy::releaseTopNode(SUnit *SU) { + Available.insert(SU); +} + Index: lib/Target/SystemZ/SystemZProcessors.td =================================================================== --- lib/Target/SystemZ/SystemZProcessors.td +++ lib/Target/SystemZ/SystemZProcessors.td @@ -78,17 +78,18 @@ def : Processor<"generic", NoItineraries, []>; def : Processor<"z10", NoItineraries, []>; -def : Processor<"z196", NoItineraries, +def : ProcessorModel<"z196", Z196Model, [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord, FeatureFPExtension, FeaturePopulationCount, FeatureFastSerialization, FeatureInterlockedAccess1]>; -def : Processor<"zEC12", NoItineraries, +def : ProcessorModel<"zEC12", ZEC12Model, [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord, FeatureFPExtension, FeaturePopulationCount, FeatureFastSerialization, FeatureInterlockedAccess1, FeatureMiscellaneousExtensions, FeatureTransactionalExecution, FeatureProcessorAssist]>; -def : Processor<"z13", NoItineraries, + +def : ProcessorModel<"z13", Z13Model, [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord, FeatureFPExtension, FeaturePopulationCount, FeatureFastSerialization, FeatureInterlockedAccess1, Index: lib/Target/SystemZ/SystemZSchedule.td =================================================================== --- /dev/null +++ lib/Target/SystemZ/SystemZSchedule.td @@ -0,0 +1,70 @@ +//==-- SystemZSchedule.td - SystemZ Scheduling Definitions ----*- tblgen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Scheduler resources + +// These three resources are used to express decoder grouping rules. 
+def GroupAlone : SchedWrite; +def BeginGroup : SchedWrite; +def EndGroup : SchedWrite; + +// These resources model a decoder group slot and execution unit with +// latency. If more than one of these are used for an instruction, the +// greatest latency will result, and the number of uops will be added, +// although a resource may have 0 NumMicroOps. + +// Fixed-point +def FXU : SchedWrite; +def FXU_2cyc : SchedWrite; +def FXU_3cyc : SchedWrite; +def FXU_4cyc : SchedWrite; +def FXU_5cyc : SchedWrite; +def FXU_6cyc : SchedWrite; +def FXU_7cyc : SchedWrite; +def FXU_8cyc : SchedWrite; +def FXU_9cyc : SchedWrite; +def FXU_15cyc : SchedWrite; +def FXU_20cyc : SchedWrite; +def FXU_30cyc : SchedWrite; + +// Load-store +def LSU : SchedWrite; +def LSU_2cyc : SchedWrite; +def LSU_5cyc : SchedWrite; +def LSU_6cyc : SchedWrite; +def LSU_20cyc : SchedWrite; +def LSU_30cyc : SchedWrite; + +// Vector +// B is defined as a "single pass through pipeline". +def VFU_Bcyc : SchedWrite; +def VFU_Bplus1cyc : SchedWrite; +def VFU_Bplus2cyc : SchedWrite; +def VFU_15cyc : SchedWrite; +def VFU_20cyc : SchedWrite; +def VFU_30cyc : SchedWrite; + +// Blocking BFP div/sqrt unit. +def FPd_30cyc : SchedWrite; + +// Virtual branching unit +def VBU : SchedWrite; + +// Floating point unit (zEC12 and earlier) +def FPU_Bcyc : SchedWrite; +def FPU_Bplus1cyc : SchedWrite; +def FPU_Bplus2cyc : SchedWrite; +def FPU_15cyc : SchedWrite; +def FPU_20cyc : SchedWrite; +def FPU_30cyc : SchedWrite; + +include "SystemZScheduleZ13.td" +include "SystemZScheduleZEC12.td" +include "SystemZScheduleZ196.td" + Index: lib/Target/SystemZ/SystemZScheduleZ13.td =================================================================== --- /dev/null +++ lib/Target/SystemZ/SystemZScheduleZ13.td @@ -0,0 +1,724 @@ +//==-- SystemZScheduleZ13.td - SystemZ Scheduling Definitions -*- tblgen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Z13 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Z13Model : SchedMachineModel { + + let IssueWidth = 6; // 2 * 3 instructions decoded per cycle. + let MicroOpBufferSize = 60; // Issue queues + let MinLatency = 0; // Out-of-order + let LoadLatency = 1; // Optimistic load latency. + + let PostRAScheduler = 1; + + // Extra cycles for a mispredicted branch. + let MispredictPenalty = 8; + + // Max micro-ops that can be buffered for + // optimized loop dispatch/execution. + let LoopMicroOpBufferSize = 12; + + // This model does not include operand specific information. + let CompleteModel = 0; +} + +let SchedModel = Z13Model in { + +// Execution units. BufferSize controls when scheduler will start to +// postpone scheduling of instructions using that particular unit. +def Z13_VBUnit : ProcResource<1>; +def Z13_FXUnit : ProcResource<2> { let BufferSize = 2; /* ooo */ } +def Z13_LSUnit : ProcResource<2> { let BufferSize = 2; /* ooo */ } +def Z13_VFUnit : ProcResource<2> { let BufferSize = 2; /* ooo */ } +def Z13_FPdUnit : ProcResource<2> { let BufferSize = 0; /* blocking */ } + +def : WriteRes { + let NumMicroOps = 0; + let BeginGroup = 1; + let EndGroup = 1; +} + +def : WriteRes { + let NumMicroOps = 0; + let BeginGroup = 1; +} + +def : WriteRes { + let NumMicroOps = 0; + let EndGroup = 1; +} + +// Subtarget specific definitions of scheduling resources. 
+ +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 7; } +def : WriteRes { let Latency = 8; } +def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 15; } +def : WriteRes { let Latency = 20; } +def : WriteRes { let Latency = 30; } + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 20; } +def : WriteRes { let Latency = 30; } + +def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 10; } +def : WriteRes { let Latency = 11; } +def : WriteRes { let Latency = 15; } +def : WriteRes { let Latency = 20; } +def : WriteRes { let Latency = 30; } + +// This should be modelled as using FPd for ~30 cycles, but that seems +// bad since SchedBoundary would consider the FPd stall a global stall +// and increase CurrCycle by 30. +def : WriteRes { let Latency = 30; } + +def : WriteRes; + +// -------------------------- INSTRUCTIONS ---------------------------------- // + +// InstRW constructs have been used in order to preserve the +// readability of the InstrInfo files. + +// For each instruction, as matched by a regexp, provide a list of +// resources that it needs. These will be combined into a SchedClass. 
+ +// Call +def : InstRW<[VBU, FXU_2cyc, FXU, GroupAlone], (instregex "BRAS$")>; +def : InstRW<[FXU_2cyc, FXU, FXU, GroupAlone], (instregex "(Call)?BASR$")>; +def : InstRW<[FXU, EndGroup], (instregex "CallBR$")>; +def : InstRW<[FXU_2cyc, FXU, FXU, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[FXU_2cyc, FXU, FXU, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[FXU, EndGroup], (instregex "Return$")>; + +// Serialize +def : InstRW<[FXU, EndGroup], (instregex "Serialize$")>; + +///// FIXED POINT + +// Addition +def : InstRW<[FXU], (instregex "A(Y|IH|SI)?$")>; +def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; +def : InstRW<[FXU], (instregex "AG(SI)?$")>; +def : InstRW<[FXU], (instregex "AGFI$")>; +def : InstRW<[FXU], (instregex "AGHI(K)?$")>; +def : InstRW<[FXU], (instregex "AGR(K)?$")>; +def : InstRW<[FXU], (instregex "AHI(K)?$")>; +def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; +def : InstRW<[FXU], (instregex "AL(Y|FI|HSIK)?$")>; +def : InstRW<[FXU], (instregex "ALG(HSIK)?$")>; +def : InstRW<[FXU], (instregex "ALGF(I|R)?$")>; +def : InstRW<[FXU], (instregex "ALGR(K)?$")>; +def : InstRW<[FXU], (instregex "ALR(K)?$")>; +def : InstRW<[FXU], (instregex "AR(K)?$")>; + +// Logical addition with carry +def : InstRW<[FXU_2cyc, GroupAlone], (instregex "ALC(R)?$")>; +def : InstRW<[FXU_2cyc, GroupAlone], (instregex "ALCG(R)?$")>; + +// Add with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc], (instregex "AGF(R)?$")>; + +// Add halfword +def : InstRW<[FXU_2cyc], (instregex "AH(Y)?$")>; + +// Subtraction +def : InstRW<[FXU], (instregex "S(G|Y)?$")>; +def : InstRW<[FXU], (instregex "SGR(K)?$")>; +def : InstRW<[FXU], (instregex "SL(G|Y|FI)?$")>; +def : InstRW<[FXU], (instregex "SLGF(I|R)?$")>; +def : InstRW<[FXU], (instregex "SLGR(K)?$")>; +def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; +def : InstRW<[FXU], (instregex "SLR(K)?$")>; +def : InstRW<[FXU], (instregex "SR(K)?$")>; +def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; +def : InstRW<[FXU], 
(instregex "SRL(G|K)?$")>; + +// Subtraction with borrow +def : InstRW<[FXU_2cyc, GroupAlone], (instregex "SLB(G|R)?$")>; +def : InstRW<[FXU_2cyc, GroupAlone], (instregex "SLBGR$")>; + +// Subtraction with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc], (instregex "SGF(R)?$")>; + +// Subtract halfword +def : InstRW<[FXU_2cyc], (instregex "SH(Y)?$")>; + +// Multiply +def : InstRW<[FXU_6cyc], (instregex "MS(R|Y|FI)?$")>; +def : InstRW<[FXU_8cyc], (instregex "MSG(R)?$")>; +def : InstRW<[FXU_6cyc], (instregex "MSGF(I|R)?$")>; +def : InstRW<[FXU_9cyc, GroupAlone], (instregex "MLG(R)?$")>; +def : InstRW<[FXU_5cyc], (instregex "MGHI$")>; +def : InstRW<[FXU_5cyc], (instregex "MH(I|Y)?$")>; + +// Divide +def : InstRW<[FXU_30cyc, GroupAlone], (instregex "DSG(F)?R$")>; +def : InstRW<[LSU, FXU_30cyc, GroupAlone], (instregex "DSG(F)?$")>; +def : InstRW<[FXU_20cyc, GroupAlone], (instregex "DLR$")>; +def : InstRW<[FXU_30cyc, GroupAlone], (instregex "DLGR$")>; +def : InstRW<[LSU, FXU_20cyc, GroupAlone], (instregex "DL$")>; +def : InstRW<[LSU, FXU_30cyc, GroupAlone], (instregex "DLG$")>; + +// And +def : InstRW<[FXU], (instregex "N(G|Y|TSTG)?$")>; +def : InstRW<[FXU], (instregex "NGR(K)?$")>; +def : InstRW<[FXU], (instregex "NI(Y|FMux|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "NIHF(64)?$")>; +def : InstRW<[FXU], (instregex "NIHH(64)?$")>; +def : InstRW<[FXU], (instregex "NIHL(64)?$")>; +def : InstRW<[FXU], (instregex "NILF(64)?$")>; +def : InstRW<[FXU], (instregex "NILH(64)?$")>; +def : InstRW<[FXU], (instregex "NILL(64)?$")>; +def : InstRW<[FXU], (instregex "NR(K)?$")>; + +// Or +def : InstRW<[FXU], (instregex "O(G|Y)?$")>; +def : InstRW<[FXU], (instregex "OGR(K)?$")>; +def : InstRW<[FXU], (instregex "OI(Y|FMux|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "OIHF(64)?$")>; +def : InstRW<[FXU], (instregex "OIHH(64)?$")>; +def : InstRW<[FXU], (instregex "OIHL(64)?$")>; +def : InstRW<[FXU], (instregex "OILF(64)?$")>; +def : InstRW<[FXU], (instregex "OILH(64)?$")>; +def : 
InstRW<[FXU], (instregex "OILL(64)?$")>; +def : InstRW<[FXU], (instregex "OR(K)?$")>; + +// Xor +def : InstRW<[FXU], (instregex "XI(Y)?$")>; +def : InstRW<[FXU], (instregex "X(G|Y|IFMux)?$")>; +def : InstRW<[FXU], (instregex "XGR(K)?$")>; +def : InstRW<[FXU], (instregex "XIHF(64)?$")>; +def : InstRW<[FXU], (instregex "XILF(64)?$")>; +def : InstRW<[FXU], (instregex "XR(K)?$")>; + +// Insert +def : InstRW<[FXU], (instregex "IC(Y)?$")>; +def : InstRW<[FXU], (instregex "IC32(Y)?$")>; +def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[FXU], (instregex "IIHF(64)?$")>; +def : InstRW<[FXU], (instregex "IIHH(64)?$")>; +def : InstRW<[FXU], (instregex "IIHL(64)?$")>; +def : InstRW<[FXU], (instregex "IILF(64)?$")>; +def : InstRW<[FXU], (instregex "IILH(64)?$")>; +def : InstRW<[FXU], (instregex "IILL(64)?$")>; + +// And / Or / Xor character +def : InstRW<[LSU, FXU, BeginGroup], (instregex "NC$")>; +def : InstRW<[LSU, FXU, BeginGroup], (instregex "OC$")>; +def : InstRW<[LSU, FXU, BeginGroup], (instregex "XC$")>; + +// Rotate +def : InstRW<[FXU], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[FXU], (instregex "RISBG(N|32)?$")>; +def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>; +def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[FXU], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[FXU, FXU_2cyc, BeginGroup], (instregex "R(N|O|X)SBG$")>; + +// Extend +def : InstRW<[FXU], (instregex "AEXT128_64$")>; +def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; + +// Find leftmost one +def : InstRW<[FXU_6cyc, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[FXU_3cyc], (instregex "POPCNT$")>; + +// Compare +def : InstRW<[FXU], (instregex "CG$")>; +def : InstRW<[FXU], (instregex "C(G|Y|IH|Mux)?$")>; +def : InstRW<[FXU], (instregex "CFI(Mux)?$")>; +def : InstRW<[FXU], (instregex "CGFI$")>; +def : InstRW<[FXU], (instregex "CGH(I|SI)$")>; +def : InstRW<[FXU], (instregex "CGR(L)?$")>; +def : InstRW<[FXU], 
(instregex "CH(I|F|SI)$")>; +def : InstRW<[FXU], (instregex "CL(Y|Mux|FHSI)?$")>; +def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>; +def : InstRW<[FXU], (instregex "CLG(HRL|HSI)?$")>; +def : InstRW<[FXU], (instregex "CLGF(I)?$")>; +def : InstRW<[FXU], (instregex "CLGFR(L)?$")>; +def : InstRW<[FXU], (instregex "CLGR(L)?$")>; +def : InstRW<[FXU], (instregex "CLH(F|RL|HSI)$")>; +def : InstRW<[FXU], (instregex "CLI(H|Y)?$")>; +def : InstRW<[FXU], (instregex "CLR(L)?$")>; +def : InstRW<[FXU], (instregex "CR(L)?$")>; + +// Compare halfword +def : InstRW<[FXU_2cyc], (instregex "CH(Y|RL)?$")>; +def : InstRW<[FXU_2cyc], (instregex "CGH(RL)?$")>; +def : InstRW<[FXU, FXU, BeginGroup], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc], (instregex "CGF(R|RL)?$")>; + +// Compare and swap +def : InstRW<[FXU, FXU, GroupAlone], (instregex "CS(G|Y)?$")>; + +// Compare logical character +def : InstRW<[FXU, LSU, BeginGroup], (instregex "CLC$")>; + +// Test under mask +def : InstRW<[FXU], (instregex "TM(Y|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "TMHH(64)?$")>; +def : InstRW<[FXU], (instregex "TMHL(64)?$")>; +def : InstRW<[FXU], (instregex "TMLH(64)?$")>; +def : InstRW<[FXU], (instregex "TMLL(64)?$")>; + +// Load and test +def : InstRW<[FXU], (instregex "LT(R)?$")>; +def : InstRW<[FXU], (instregex "LTG(R)?$")>; +def : InstRW<[FXU], (instregex "LTGF(R)?$")>; + +// Moves +def : InstRW<[FXU], (instregex "MVGHI$")>; +def : InstRW<[FXU], (instregex "MVH(I|HI)$")>; +def : InstRW<[FXU], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[LSU_2cyc, LSU, FXU, BeginGroup], (instregex "MVC$")>; + +// Pseudo -> reg move +def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>; +def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>; +def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>; + +// Loads (LSU) +def : InstRW<[LSU], (instregex 
"L(Y|FH|RL|Mux|CBB)?$")>; +def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSU], (instregex "LG(RL)?$")>; +def : InstRW<[LSU], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSU], (instregex "LLG(C|F|H|FRL|HRL)$")>; +def : InstRW<[LSU], (instregex "LLH(RL|Mux)?$")>; +def : InstRW<[LSU], (instregex "L(X|128)$")>; + +// Loads (FXU) +def : InstRW<[FXU], (instregex "LLCH$")>; +def : InstRW<[FXU], (instregex "LLHH$")>; +def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LLG(C|F|H)R$")>; +def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>; +def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>; +def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY +def : InstRW<[FXU], (instregex "LAA(G)?$")>; +def : InstRW<[FXU], (instregex "LAAL(G)?$")>; +def : InstRW<[FXU], (instregex "LAN(G)?$")>; +def : InstRW<[FXU], (instregex "LAO(G)?$")>; +def : InstRW<[FXU], (instregex "LAX(G)?$")>; +def : InstRW<[FXU], (instregex "LB(H|R|Mux)?$")>; +def : InstRW<[FXU], (instregex "LGR$")>; +def : InstRW<[FXU], (instregex "LGB(R)?$")>; +def : InstRW<[FXU], (instregex "LGF(I)?$")>; +def : InstRW<[FXU], (instregex "LGFR(L)?$")>; +def : InstRW<[FXU], (instregex "LGH(I)?$")>; +def : InstRW<[FXU], (instregex "LGHR(L)?$")>; +def : InstRW<[FXU], (instregex "LH(H|I|Y|Mux|IMux)?$")>; +def : InstRW<[FXU], (instregex "LHR(L)?$")>; +def : InstRW<[FXU], (instregex "LR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LRV(R)?$")>; +def : InstRW<[FXU], (instregex "LRVG(R)?$")>; + +// Load GR from FPR +def : InstRW<[FXU_3cyc], (instregex "LGDR$")>; + +// Load multiple (estimated average of 5 ops) +def : InstRW<[LSU_5cyc, LSU, LSU, LSU, LSU, GroupAlone], (instregex "LMG$")>; + +// Load Complement / Negative / Positive +def : InstRW<[FXU], (instregex "LC(R|GR)$")>; +def : InstRW<[FXU_2cyc], (instregex "LN(R|GR)$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex 
"LCGFR$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "L(N|P)GFR$")>; +def : InstRW<[FXU_2cyc], (instregex "LP(G)?R$")>; + +// Load on condition +def : InstRW<[FXU_2cyc], (instregex "LOC(R)?$")>; +def : InstRW<[FXU_2cyc], (instregex "LOCG(R)?$")>; + +// Stores +def : InstRW<[FXU], (instregex "STG(RL)?$")>; +def : InstRW<[FXU], (instregex "ST(X|128)$")>; +def : InstRW<[FXU], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU], (instregex "ST(Y|FH|RL|Mux)?$")>; +def : InstRW<[FXU], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[FXU], (instregex "STRV(G)?$")>; + +// Store on condition / CondStore pseudos +def : InstRW<[FXU], (instregex "STOC(G)?$")>; +def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>; + +// Store multiple (estimated average of 5 ops) +def : InstRW<[LSU, LSU, FXU_5cyc, FXU, FXU, FXU, FXU, + GroupAlone], (instregex "STMG$")>; + +// Select pseudo +def : InstRW<[FXU], (instregex "Select(32|64|F32|F64|F128|32Mux)$")>; + +// String instructions +def : InstRW<[FXU_30cyc], (instregex "SRST$")>; +def : InstRW<[LSU_30cyc, GroupAlone], (instregex "MVST$")>; +def : InstRW<[LSU_30cyc, GroupAlone], (instregex "CLST$")>; + +///// FLOATING POINT + +// Addition +def : InstRW<[VFU_Bcyc], (instregex "AEB(R)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "ADB(R)?$")>; +def : InstRW<[VFU_Bplus2cyc, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[VFU_Bcyc], (instregex "SEB(R)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "SDB(R)?$")>; +def : InstRW<[VFU_Bplus2cyc, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[VFU_Bcyc], (instregex 
"MEEB(R)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "MDB(R)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "MDEB(R)?$")>; +def : InstRW<[VFU_Bcyc, GroupAlone], (instregex "MXDB$")>; +def : InstRW<[VFU_Bplus1cyc, GroupAlone], (instregex "MXDBR$")>; +def : InstRW<[VFU_20cyc, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[VFU_Bcyc, GroupAlone], (instregex "MAEB(R)?$")>; +def : InstRW<[VFU_Bcyc, GroupAlone], (instregex "MSEB(R)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "M(A|S)DBR$")>; +def : InstRW<[VFU_Bcyc, GroupAlone], (instregex "M(A|S)DB$")>; + +// Division +def : InstRW<[FPd_30cyc], (instregex "DEB(R)?$")>; +def : InstRW<[FPd_30cyc], (instregex "DDB(R)?$")>; +def : InstRW<[FPd_30cyc, GroupAlone], (instregex "DXBR$")>; + +// Square root +def : InstRW<[FPd_30cyc], (instregex "SQEB(R)?$")>; +def : InstRW<[FPd_30cyc], (instregex "SQDB(R)?$")>; +def : InstRW<[FPd_30cyc, GroupAlone], (instregex "SQXBR$")>; + +// Convert from fixed / logical +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "CE(F|G)BR$")>; +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "CD(F|G)BR$")>; +def : InstRW<[FXU, VFU_Bplus2cyc, GroupAlone], (instregex "CX(F|G)BR$")>; +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "CEL(F|G)BR$")>; +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "CDL(F|G)BR$")>; +def : InstRW<[FXU, VFU_Bplus2cyc, GroupAlone], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "CF(E|D)BR$")>; +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "CG(E|D)BR$")>; +def : InstRW<[FXU, VFU_Bplus1cyc, BeginGroup], (instregex "C(F|G)XBR$")>; +def : InstRW<[FXU, VFU_Bcyc, GroupAlone], (instregex "CLFEBR$")>; +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "CLFDBR$")>; +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "CLG(E|D)BR$")>; +def : InstRW<[FXU, VFU_Bplus1cyc, BeginGroup], (instregex "CL(F|G)XBR$")>; + +// Copy sign +def : InstRW<[VFU_Bcyc], (instregex 
"CPSDRd(d|s)$")>; +def : InstRW<[VFU_Bcyc], (instregex "CPSDRs(d|s)$")>; + +// Compare +def : InstRW<[VFU_Bcyc], (instregex "CEB(R)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "CDB(R)?$")>; +def : InstRW<[VFU_Bplus1cyc, GroupAlone], (instregex "CXBR$")>; + +// Load and Test +def : InstRW<[VFU_Bcyc], (instregex "LT(D|E)BR$")>; +def : InstRW<[VFU_Bcyc], (instregex "LTEBRCompare(_VecPseudo)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "LTDBRCompare(_VecPseudo)?$")>; +def : InstRW<[VFU_Bplus2cyc, GroupAlone], (instregex "LTXBR$")>; +def : InstRW<[VFU_Bplus2cyc, GroupAlone], + (instregex "LTXBRCompare(_VecPseudo)?$")>; + +// Load +def : InstRW<[VFU_Bcyc], (instregex "LE(R|Y)?$")>; +def : InstRW<[FXU], (instregex "LD(R|GR)$")>; +def : InstRW<[FXU_2cyc, FXU, GroupAlone], (instregex "LXR$")>; + +// Load zero +def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "LZXR$")>; + +// Load Complement / Negative / Positive +def : InstRW<[VFU_Bcyc], (instregex "L(C|N|P)DBR$")>; +def : InstRW<[VFU_Bcyc], (instregex "L(C|N|P)EBR$")>; +def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>; +def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>; +def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>; +def : InstRW<[VFU_Bplus2cyc, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Load lengthened +def : InstRW<[VFU_Bcyc], (instregex "LDEB(R)?$")>; +def : InstRW<[VFU_Bcyc, GroupAlone], (instregex "LX(D|E)B$")>; +def : InstRW<[VFU_Bplus1cyc, GroupAlone], (instregex "LX(D|E)BR$")>; + +// Load rounded +def : InstRW<[VFU_Bcyc], (instregex "LEDBR(A)?$")>; +def : InstRW<[VFU_Bplus1cyc], (instregex "LEXBR(A)?$")>; +def : InstRW<[VFU_Bplus1cyc], (instregex "LDXBR(A)?$")>; + +// Load FP integer +def : InstRW<[VFU_Bcyc], (instregex "FIEBR(A)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "FIDBR(A)?$")>; +def : InstRW<[VFU_Bplus2cyc, GroupAlone], (instregex "FIXBR(A)?$")>; + +// Store +def : InstRW<[FXU_3cyc], (instregex "STD(Y)?$")>; +def : InstRW<[FXU_3cyc], (instregex 
"STE(Y)?$")>; + +///// VECTOR + +// Various +def : InstRW<[VFU_Bcyc], (instregex "VA(B|F|G|H|Q|CQ)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VACC(B|F|G|H|Q|CQ)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VAVG(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VAVGL(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCD(GB|LGB)$")>; +def : InstRW<[VFU_Bcyc], (instregex "WCD(GB|LGB)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCEQB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCEQF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCEQG(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCEQH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCGDB$")>; +def : InstRW<[VFU_Bcyc], (instregex "WCGDB$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCHB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCHF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCHG(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCHH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCHLB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCHLF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCHLG(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCHLH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCKSM$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCLGDB$")>; +def : InstRW<[VFU_Bcyc], (instregex "WCLGDB$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCLZ(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VCTZ(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VEC(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VECL(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VERIM(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VERLL(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VERLLV(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VESL(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VESLV(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VESRA(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VESRAV(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VESRL(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex 
"VESRLV(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFA(D|E)B$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFAEBS$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFAEF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFAEH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFAEZB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFAEZF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFAEZH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFCEDB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "WFCEDB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFCHDB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "WFCHDB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFCHEDB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "WFCHEDB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFEEB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFEEF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFEEH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFEEZB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFEEZF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFEEZH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFENEB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFENEF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFENEH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFENEZB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFENEZF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFENEZH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VF(I|M|S)DB$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFL(C|N|P)DB$")>; +def : InstRW<[VFU_Bcyc], (instregex "WFL(C|N|P)DB$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFM(A|S)DB$")>; +def : InstRW<[VFU_Bcyc], (instregex "WFM(A|S)DB$")>; +def : InstRW<[VFU_Bcyc], (instregex "VFTCIDB$")>; +def : InstRW<[VFU_Bcyc], (instregex "WFTCIDB$")>; +def : InstRW<[VFU_Bcyc], (instregex "VGBM$")>; +def : InstRW<[VFU_Bcyc], (instregex "VGFMA(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VGM(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex 
"VISTRB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VISTRF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VISTRH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VLC(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VL(DE|ED)B$")>; +def : InstRW<[VFU_Bcyc], (instregex "WL(DE|ED)B$")>; +def : InstRW<[VFU_Bcyc], (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VLEI(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VLP(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMAE(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMAH(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMAL(B|F)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMALE(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMALH(B|F|H|W)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMALO(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMAO(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VME(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMH(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VML(B|F)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMLE(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMLH(B|F|H|W)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMLO(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMN(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMNL(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMO(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMRH(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMRL(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMX(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VMXL(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VN(C|O)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VO(NE)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPDI$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPERM$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPK(F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPKLSF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPKLSG(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPKLSH(S)?$")>; +def : 
InstRW<[VFU_Bcyc], (instregex "VPKSF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPKSG(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPKSH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VPOPCT$")>; +def : InstRW<[VFU_Bcyc], (instregex "VREP(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VREPI(B|F|G|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSB(IQ|CBIQ)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSCBI(B|F|G|H|Q)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSEG(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VS(F|G|H|Q|EL)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSL(DB)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSRA$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSRL$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSTRCB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSTRCF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSTRCH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSTRCZB(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSTRCZF(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSTRCZH(S)?$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSUM(B|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSUMG(F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VSUMQ(F|G)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VTM$")>; +def : InstRW<[VFU_Bcyc], (instregex "VUPH(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VUPL(B|F)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VUPLH(B|F|H|W)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VUPLL(B|F|H)$")>; +def : InstRW<[VFU_Bcyc], (instregex "VX$")>; +def : InstRW<[VFU_Bcyc], (instregex "VZERO$")>; +def : InstRW<[VFU_Bcyc], (instregex "WF(A|C|I|K|M|S)DB$")>; + +// Vector divide / square root +def : InstRW<[FPd_30cyc], (instregex "(V|W)FDDB$")>; +def : InstRW<[FPd_30cyc], (instregex "(V|W)FSQDB$")>; + +// Moving between GPR and FPR +def : InstRW<[FXU], (instregex "VLVG(B|F|G|H)$")>; +def : InstRW<[FXU], (instregex "LEFR$")>; // Printed as VLVGF +def : InstRW<[FXU_4cyc], (instregex "VLGV(B|F|G|H)$")>; +def : InstRW<[FXU_4cyc], (instregex 
"LFER$")>; // Printed as VLGVF +def : InstRW<[FXU_2cyc], (instregex "VLVGP(32)?$")>; + +// Load +def : InstRW<[LSU_2cyc], (instregex "VL(L|BB)?$")>; +def : InstRW<[LSU], (instregex "VL(32|64)$")>; +def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H)$")>; +def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)$")>; +def : InstRW<[FXU], (instregex "VLR(32|64)?$")>; + +// Store +def : InstRW<[FXU_4cyc], (instregex "VST(L|32|64)?$")>; +def : InstRW<[FXU_4cyc], (instregex "VSTE(F|G)$")>; +def : InstRW<[VFU_Bcyc, FXU, BeginGroup], (instregex "VSTE(B|H)$")>; + +// Load / store multiple +def : InstRW<[LSU_6cyc, LSU, LSU, LSU, LSU, GroupAlone], + (instregex "VLM$")>; +def : InstRW<[LSU, LSU, FXU_8cyc, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "VSTM$")>; + +// Byte instructions +def : InstRW<[VFU_Bplus1cyc], (instregex "VSLB$")>; +def : InstRW<[VFU_Bplus1cyc], (instregex "VSRAB$")>; +def : InstRW<[VFU_Bplus1cyc], (instregex "VSRLB$")>; + +// Gather / scatter +def : InstRW<[FXU, VFU_Bcyc, BeginGroup], (instregex "VGE(F|G)$")>; +def : InstRW<[FXU_5cyc, FXU, BeginGroup], (instregex "VSCE(F|G)$")>; + +///// INLINE ASSEMBLY + +def : InstRW<[LSU, LSU, FXU_2cyc, FXU, FXU, BeginGroup], (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, FXU_3cyc, FXU, FXU, FXU, BeginGroup], + (instregex "STCKE$")>; +def : InstRW<[FXU], (instregex "STFLE$")>; + +///// OTHER + +// Extract Transaction Nesting Depth +def : InstRW<[FXU], (instregex "ETND$")>; + +// Transaction begin +def : InstRW<[LSU, LSU, FXU_5cyc, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "TBEGIN(C|_nofloat)?$")>; + +// Transaction end +def : InstRW<[FXU, GroupAlone], (instregex "TEND$")>; + +// Transaction abort +def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>; + +// Load the Global Offset Table address +def : InstRW<[FXU], (instregex "GOT$")>; + +// Prefetch data +def : InstRW<[LSU], (instregex "PFD(RL)?$")>; + +// Extract access register +def : InstRW<[LSU], (instregex "EAR$")>; + +// Insert Program Mask +def : 
InstRW<[FXU_3cyc, EndGroup], (instregex "IPM$")>;
+
+// Processor assist
+def : InstRW<[FXU], (instregex "PPA$")>;
+
+}
+
Index: lib/Target/SystemZ/SystemZScheduleZ196.td
===================================================================
--- /dev/null
+++ lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -0,0 +1,520 @@
+//==- SystemZScheduleZ196.td - SystemZ Scheduling Definitions -*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z196 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Z196Model : SchedMachineModel {
+
+  let IssueWidth = 3; // 3 instructions decoded per cycle.
+  let MicroOpBufferSize = 40; // Issue queues
+  let MinLatency = 0; // Out-of-order
+  let LoadLatency = 1; // Optimistic load latency.
+
+  let PostRAScheduler = 1;
+
+  // Extra cycles for a mispredicted branch.
+  let MispredictPenalty = 8;
+
+  // Max micro-ops that can be buffered for
+  // optimized loop dispatch/execution.
+  let LoopMicroOpBufferSize = 12;
+
+  // This model does not include operand specific information.
+  let CompleteModel = 0;
+}
+
+let SchedModel = Z196Model in {
+
+// Execution units. BufferSize controls when scheduler will start to
+// postpone scheduling of instructions using that particular unit.
+def Z196_FXUnit : ProcResource<1> { let BufferSize = 2; /* ooo */ }
+def Z196_LSUnit : ProcResource<1> { let BufferSize = 2; /* ooo */ }
+def Z196_FPUnit : ProcResource<1> { let BufferSize = 2; /* ooo */ }
+
+def : WriteRes<GroupAlone, []> { // Both begins and ends its group.
+  let NumMicroOps = 0;
+  let BeginGroup = 1;
+  let EndGroup = 1;
+}
+
+def : WriteRes<BeginGroup, []> { // Must be first in its group.
+  let NumMicroOps = 0;
+  let BeginGroup = 1;
+}
+
+def : WriteRes<EndGroup, []> { // Must be last in its group.
+  let NumMicroOps = 0;
+  let EndGroup = 1;
+}
+
+// Subtarget specific definitions of scheduling resources.
+// NOTE(review): WriteRes args rebuilt from names used by InstRW; confirm.
+def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; }
+def : WriteRes<FXU_2cyc, [Z196_FXUnit]> { let Latency = 2; }
+def : WriteRes<FXU_3cyc, [Z196_FXUnit]> { let Latency = 3; }
+def : WriteRes<FXU_4cyc, [Z196_FXUnit]> { let Latency = 4; }
+def : WriteRes<FXU_5cyc, [Z196_FXUnit]> { let Latency = 5; }
+def : WriteRes<FXU_6cyc, [Z196_FXUnit]> { let Latency = 6; }
+def : WriteRes<FXU_7cyc, [Z196_FXUnit]> { let Latency = 7; }
+def : WriteRes<FXU_8cyc, [Z196_FXUnit]> { let Latency = 8; }
+def : WriteRes<FXU_9cyc, [Z196_FXUnit]> { let Latency = 9; }
+def : WriteRes<FXU_15cyc, [Z196_FXUnit]> { let Latency = 15; }
+def : WriteRes<FXU_20cyc, [Z196_FXUnit]> { let Latency = 20; }
+def : WriteRes<FXU_30cyc, [Z196_FXUnit]> { let Latency = 30; }
+
+def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 1; }
+def : WriteRes<LSU_2cyc, [Z196_LSUnit]> { let Latency = 2; }
+def : WriteRes<LSU_5cyc, [Z196_LSUnit]> { let Latency = 5; }
+def : WriteRes<LSU_6cyc, [Z196_LSUnit]> { let Latency = 6; }
+def : WriteRes<LSU_20cyc, [Z196_LSUnit]> { let Latency = 20; }
+def : WriteRes<LSU_30cyc, [Z196_LSUnit]> { let Latency = 30; }
+
+def : WriteRes<FPU_Bcyc, [Z196_FPUnit]> { let Latency = 8; }
+def : WriteRes<FPU_Bplus1cyc, [Z196_FPUnit]> { let Latency = 9; }
+def : WriteRes<FPU_Bplus2cyc, [Z196_FPUnit]> { let Latency = 10; }
+def : WriteRes<FPU_15cyc, [Z196_FPUnit]> { let Latency = 15; }
+def : WriteRes<FPU_20cyc, [Z196_FPUnit]> { let Latency = 20; }
+def : WriteRes<FPU_30cyc, [Z196_FPUnit]> { let Latency = 30; }
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+ +// Call +def : InstRW<[LSU, FXU_2cyc, FXU, GroupAlone], (instregex "BRAS$")>; +def : InstRW<[FXU_2cyc, FXU, LSU, GroupAlone], (instregex "(Call)?BASR$")>; +def : InstRW<[LSU, EndGroup], (instregex "CallBR$")>; +def : InstRW<[LSU, FXU_2cyc, FXU, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[LSU, FXU_2cyc, FXU, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[LSU, EndGroup], (instregex "Return$")>; + +// Serialize +def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>; + +///// FIXED POINT + +// Addition +def : InstRW<[FXU], (instregex "A(Y|IH|SI)?$")>; +def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; +def : InstRW<[FXU], (instregex "AG(SI)?$")>; +def : InstRW<[FXU], (instregex "AGFI$")>; +def : InstRW<[FXU], (instregex "AGHI(K)?$")>; +def : InstRW<[FXU], (instregex "AGR(K)?$")>; +def : InstRW<[FXU], (instregex "AHI(K)?$")>; +def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; +def : InstRW<[FXU], (instregex "AL(Y|FI|HSIK)?$")>; +def : InstRW<[FXU], (instregex "ALG(HSIK)?$")>; +def : InstRW<[FXU], (instregex "ALGF(I|R)?$")>; +def : InstRW<[FXU], (instregex "ALGR(K)?$")>; +def : InstRW<[FXU], (instregex "ALR(K)?$")>; +def : InstRW<[FXU], (instregex "AR(K)?$")>; + +// Logical addition with carry +def : InstRW<[FXU_3cyc, GroupAlone], (instregex "ALC(R)?$")>; +def : InstRW<[FXU_3cyc, GroupAlone], (instregex "ALCG(R)?$")>; + +// Add with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "AGF(R)?$")>; + +// Add halfword +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "AH(Y)?$")>; + +// Subtraction +def : InstRW<[FXU], (instregex "S(G|Y)?$")>; +def : InstRW<[FXU], (instregex "SGR(K)?$")>; +def : InstRW<[FXU], (instregex "SL(G|Y|FI)?$")>; +def : InstRW<[FXU], (instregex "SLGF(I|R)?$")>; +def : InstRW<[FXU], (instregex "SLGR(K)?$")>; +def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; +def : InstRW<[FXU], (instregex "SLR(K)?$")>; +def : InstRW<[FXU], (instregex "SR(K)?$")>; +def : InstRW<[FXU], (instregex 
"SRA(G|K)?$")>; +def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; + +// Subtraction with borrow +def : InstRW<[FXU_3cyc, GroupAlone], (instregex "SLB(G|R)?$")>; +def : InstRW<[FXU_3cyc, GroupAlone], (instregex "SLBGR$")>; + +// Subtraction with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "SGF(R)?$")>; + +// Subtract halfword +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "SH(Y)?$")>; + +// Multiply +def : InstRW<[FXU_6cyc], (instregex "MS(R|Y|FI)?$")>; +def : InstRW<[FXU_8cyc], (instregex "MSG(R)?$")>; +def : InstRW<[FXU_6cyc], (instregex "MSGF(I|R)?$")>; +def : InstRW<[FXU_9cyc, GroupAlone], (instregex "MLG(R)?$")>; +def : InstRW<[FXU_5cyc], (instregex "MGHI$")>; +def : InstRW<[FXU_5cyc], (instregex "MH(I|Y)?$")>; + +// Divide +def : InstRW<[FPU_30cyc, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DSG(F)?R$")>; +def : InstRW<[FPU_30cyc, LSU, FXU, FXU, FXU, GroupAlone], + (instregex "DSG(F)?$")>; +def : InstRW<[FPU_30cyc, FXU, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DL(G)?R$")>; +def : InstRW<[FPU_30cyc, LSU, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DL$")>; +def : InstRW<[FPU_30cyc, LSU, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DLG$")>; + +// And +def : InstRW<[FXU], (instregex "N(G|Y)?$")>; +def : InstRW<[FXU], (instregex "NGR(K)?$")>; +def : InstRW<[FXU], (instregex "NI(Y|FMux|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "NIHF(64)?$")>; +def : InstRW<[FXU], (instregex "NIHH(64)?$")>; +def : InstRW<[FXU], (instregex "NIHL(64)?$")>; +def : InstRW<[FXU], (instregex "NILF(64)?$")>; +def : InstRW<[FXU], (instregex "NILH(64)?$")>; +def : InstRW<[FXU], (instregex "NILL(64)?$")>; +def : InstRW<[FXU], (instregex "NR(K)?$")>; + +// Or +def : InstRW<[FXU], (instregex "O(G|Y)?$")>; +def : InstRW<[FXU], (instregex "OGR(K)?$")>; +def : InstRW<[FXU], (instregex "OI(Y|FMux|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "OIHF(64)?$")>; +def : InstRW<[FXU], (instregex "OIHH(64)?$")>; +def : InstRW<[FXU], (instregex 
"OIHL(64)?$")>; +def : InstRW<[FXU], (instregex "OILF(64)?$")>; +def : InstRW<[FXU], (instregex "OILH(64)?$")>; +def : InstRW<[FXU], (instregex "OILL(64)?$")>; +def : InstRW<[FXU], (instregex "OR(K)?$")>; + +// Xor +def : InstRW<[FXU], (instregex "XI(Y)?$")>; +def : InstRW<[FXU], (instregex "X(G|Y|IFMux)?$")>; +def : InstRW<[FXU], (instregex "XGR(K)?$")>; +def : InstRW<[FXU], (instregex "XIHF(64)?$")>; +def : InstRW<[FXU], (instregex "XILF(64)?$")>; +def : InstRW<[FXU], (instregex "XR(K)?$")>; + +// Insert +def : InstRW<[FXU], (instregex "IC(Y)?$")>; +def : InstRW<[FXU], (instregex "IC32(Y)?$")>; +def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[FXU], (instregex "IIHF(64)?$")>; +def : InstRW<[FXU], (instregex "IIHH(64)?$")>; +def : InstRW<[FXU], (instregex "IIHL(64)?$")>; +def : InstRW<[FXU], (instregex "IILF(64)?$")>; +def : InstRW<[FXU], (instregex "IILH(64)?$")>; +def : InstRW<[FXU], (instregex "IILL(64)?$")>; + +// And / Or / Xor character +def : InstRW<[LSU, FXU, BeginGroup], (instregex "NC$")>; +def : InstRW<[LSU, FXU, BeginGroup], (instregex "OC$")>; +def : InstRW<[LSU, FXU, BeginGroup], (instregex "XC$")>; + +// Rotate +def : InstRW<[FXU], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[FXU], (instregex "RISBG(32)?$")>; +def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>; +def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[FXU], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[FXU, FXU_2cyc, GroupAlone], (instregex "R(N|O|X)SBG$")>; + +// Extend +def : InstRW<[FXU], (instregex "AEXT128_64$")>; +def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; + +// Find leftmost one +def : InstRW<[FXU_7cyc, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[FXU_3cyc], (instregex "POPCNT$")>; + +// Compare +def : InstRW<[FXU], (instregex "CG$")>; +def : InstRW<[FXU], (instregex "C(G|Y|IH|Mux)?$")>; +def : InstRW<[FXU], (instregex "CFI(Mux)?$")>; +def : InstRW<[FXU], (instregex "CGFI$")>; 
+def : InstRW<[FXU], (instregex "CGH(I|SI)$")>; +def : InstRW<[FXU], (instregex "CGR(L)?$")>; +def : InstRW<[FXU], (instregex "CH(I|F|SI)$")>; +def : InstRW<[FXU], (instregex "CL(Y|Mux|FHSI)?$")>; +def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>; +def : InstRW<[FXU], (instregex "CLG(HRL|HSI)?$")>; +def : InstRW<[FXU], (instregex "CLGF(I)?$")>; +def : InstRW<[FXU], (instregex "CLGFR(L)?$")>; +def : InstRW<[FXU], (instregex "CLGR(L)?$")>; +def : InstRW<[FXU], (instregex "CLH(F|RL|HSI)$")>; +def : InstRW<[FXU], (instregex "CLI(H|Y)?$")>; +def : InstRW<[FXU], (instregex "CLR(L)?$")>; +def : InstRW<[FXU], (instregex "CR(L)?$")>; + +// Compare halfword +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "CH(Y|RL)?$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "CGH(RL)?$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "CGF(R|RL)?$")>; + +// Compare and swap +def : InstRW<[FXU, FXU, BeginGroup], (instregex "CS(G|Y)?$")>; + +// Compare logical character +def : InstRW<[LSU, FXU, BeginGroup], (instregex "CLC$")>; + +// Test under mask +def : InstRW<[FXU], (instregex "TM(Y|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "TMHH(64)?$")>; +def : InstRW<[FXU], (instregex "TMHL(64)?$")>; +def : InstRW<[FXU], (instregex "TMLH(64)?$")>; +def : InstRW<[FXU], (instregex "TMLL(64)?$")>; + +// Load and test +def : InstRW<[FXU], (instregex "LT(R)?$")>; +def : InstRW<[FXU], (instregex "LTG(R)?$")>; +def : InstRW<[FXU], (instregex "LTGF(R)?$")>; + +// Moves +def : InstRW<[FXU], (instregex "MVGHI$")>; +def : InstRW<[FXU], (instregex "MVH(I|HI)$")>; +def : InstRW<[FXU], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[LSU_2cyc, LSU, FXU, BeginGroup], (instregex "MVC$")>; + +// Pseudo -> reg move +def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[FXU], (instregex 
"INSERT_SUBREG$")>; +def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>; +def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>; + +// Loads (LSU) +def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>; +def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSU], (instregex "LG(RL)?$")>; +def : InstRW<[LSU], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSU], (instregex "LLG(C|F|H|FRL|HRL)$")>; +def : InstRW<[LSU], (instregex "LLH(RL|Mux)?$")>; +def : InstRW<[LSU], (instregex "L(X|128)$")>; + +// Loads (FXU) +def : InstRW<[FXU], (instregex "LLCH$")>; +def : InstRW<[FXU], (instregex "LLHH$")>; +def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LLG(C|F|H)R$")>; +def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>; +def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>; +def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY +def : InstRW<[FXU], (instregex "LAA(G)?$")>; +def : InstRW<[FXU], (instregex "LAAL(G)?$")>; +def : InstRW<[FXU], (instregex "LAN(G)?$")>; +def : InstRW<[FXU], (instregex "LAO(G)?$")>; +def : InstRW<[FXU], (instregex "LAX(G)?$")>; +def : InstRW<[FXU], (instregex "LB(H|R|Mux)?$")>; +def : InstRW<[FXU], (instregex "LGR$")>; +def : InstRW<[FXU], (instregex "LGB(R)?$")>; +def : InstRW<[FXU], (instregex "LGF(I)?$")>; +def : InstRW<[FXU], (instregex "LGFR(L)?$")>; +def : InstRW<[FXU], (instregex "LGH(I)?$")>; +def : InstRW<[FXU], (instregex "LGHR(L)?$")>; +def : InstRW<[FXU], (instregex "LH(H|I|Y|Mux|IMux)?$")>; +def : InstRW<[FXU], (instregex "LHR(L)?$")>; +def : InstRW<[FXU], (instregex "LR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LRV(R)?$")>; +def : InstRW<[FXU], (instregex "LRVG(R)?$")>; + +// Load GR from FPR +def : InstRW<[FXU_3cyc], (instregex "LGDR$")>; + +// Load multiple (estimated average of 5 ops) +def : InstRW<[LSU_5cyc, LSU, LSU, LSU, LSU, BeginGroup], (instregex "LMG$")>; + +// Load Complement / Negative / 
Positive +def : InstRW<[FXU], (instregex "LC(R|GR)$")>; +def : InstRW<[FXU_2cyc], (instregex "LN(R|GR)$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "LCGFR$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "L(N|P)GFR$")>; +def : InstRW<[FXU_2cyc], (instregex "LP(G)?R$")>; + +// Load on condition +def : InstRW<[FXU_2cyc, EndGroup], (instregex "LOC(R)?$")>; +def : InstRW<[FXU_2cyc, EndGroup], (instregex "LOCG(R)?$")>; + +// Stores +def : InstRW<[FXU], (instregex "STG(RL)?$")>; +def : InstRW<[FXU], (instregex "ST(X|128)$")>; +def : InstRW<[FXU], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU], (instregex "ST(Y|FH|RL|Mux)?$")>; +def : InstRW<[FXU], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[FXU], (instregex "STRV(G)?$")>; + +// Store on condition / CondStore pseudos +def : InstRW<[FXU, EndGroup], (instregex "STOC(G)?$")>; +def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>; + +// Store multiple (estimated average of 5 ops) +def : InstRW<[LSU, LSU, FXU_5cyc, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "STMG$")>; + +// Select pseudo +def : InstRW<[FXU], (instregex "Select(32|64|F32|F64|F128|32Mux)$")>; + +// String instructions +def : InstRW<[FXU_30cyc], (instregex "SRST$")>; +def : InstRW<[LSU_30cyc, GroupAlone], (instregex "MVST$")>; +def : InstRW<[LSU_30cyc, GroupAlone], (instregex "CLST$")>; + +///// FLOATING POINT + +// Addition +def : InstRW<[FPU_Bcyc], (instregex "AEB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "ADB(R)?$")>; +def : InstRW<[FPU_20cyc, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[FPU_Bcyc], 
(instregex "SEB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "SDB(R)?$")>; +def : InstRW<[FPU_20cyc, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[FPU_Bcyc], (instregex "MEEB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "MDB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "MDEB(R)?$")>; +def : InstRW<[FPU_Bplus2cyc, GroupAlone], (instregex "MXDB(R)?$")>; +def : InstRW<[FPU_30cyc, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[FPU_Bcyc, GroupAlone], (instregex "MAEB(R)?$")>; +def : InstRW<[FPU_Bcyc, GroupAlone], (instregex "MSEB(R)?$")>; +def : InstRW<[FPU_Bcyc, GroupAlone], (instregex "M(A|S)DBR$")>; +def : InstRW<[FPU_Bcyc, GroupAlone], (instregex "M(A|S)DB$")>; + +// Division +def : InstRW<[FPU_30cyc], (instregex "DEB(R)?$")>; +def : InstRW<[FPU_30cyc], (instregex "DDB(R)?$")>; +def : InstRW<[FPU_30cyc, GroupAlone], (instregex "DXBR$")>; + +// Square root +def : InstRW<[FPU_30cyc], (instregex "SQEB(R)?$")>; +def : InstRW<[FPU_30cyc], (instregex "SQDB(R)?$")>; +def : InstRW<[FPU_30cyc, GroupAlone], (instregex "SQXBR$")>; + +// Convert from fixed / logical +def : InstRW<[FXU, FPU_Bcyc, GroupAlone], (instregex "CE(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, GroupAlone], (instregex "CD(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bplus2cyc, GroupAlone], (instregex "CX(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, BeginGroup], (instregex "CEL(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, BeginGroup], (instregex "CDL(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bplus2cyc, GroupAlone], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[FXU, FPU_Bplus1cyc, GroupAlone], (instregex "CF(E|D|X)BR$")>; +def : InstRW<[FXU, FPU_Bplus1cyc, GroupAlone], (instregex "CG(E|D|X)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, BeginGroup], (instregex "CLF(E|D)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, GroupAlone], (instregex "CLG(E|D)BR$")>; +def : InstRW<[FXU, FPU_Bplus1cyc, BeginGroup], (instregex "CL(F|G)XBR$")>; + +// Copy sign +def : 
InstRW<[FXU, FXU_3cyc, BeginGroup], (instregex "CPSDRd(d|s)$")>; +def : InstRW<[FXU, FXU_3cyc, BeginGroup], (instregex "CPSDRs(d|s)$")>; + +// Compare +def : InstRW<[FPU_Bcyc], (instregex "CEB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "CDB(R)?$")>; +def : InstRW<[FPU_30cyc], (instregex "CXBR$")>; + +// Load and Test +def : InstRW<[FPU_Bcyc], (instregex "LT(D|E)BR$")>; +def : InstRW<[FPU_Bcyc], (instregex "LTEBRCompare(_VecPseudo)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "LTDBRCompare(_VecPseudo)?$")>; +def : InstRW<[FPU_Bplus1cyc, GroupAlone], (instregex "LTXBR$")>; +def : InstRW<[FPU_Bplus1cyc, GroupAlone], + (instregex "LTXBRCompare(_VecPseudo)?$")>; + +// Load +def : InstRW<[LSU], (instregex "LE(Y)?$")>; +def : InstRW<[FXU], (instregex "LER$")>; +def : InstRW<[FXU], (instregex "LD(R|GR)$")>; +def : InstRW<[FXU_2cyc, FXU, GroupAlone], (instregex "LXR$")>; + +// Load zero +def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>; +def : InstRW<[FXU_2cyc, FXU, GroupAlone], (instregex "LZXR$")>; + +// Load Complement / Negative / Positive +def : InstRW<[FPU_Bcyc], (instregex "L(C|N|P)DBR$")>; +def : InstRW<[FPU_Bcyc], (instregex "L(C|N|P)EBR$")>; +def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>; +def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>; +def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>; +def : InstRW<[FPU_Bplus1cyc, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Load lengthened +def : InstRW<[FPU_Bcyc], (instregex "LDEB(R)?$")>; +def : InstRW<[FPU_Bplus2cyc, GroupAlone], (instregex "LX(D|E)B$")>; +def : InstRW<[FPU_Bplus2cyc, GroupAlone], (instregex "LX(D|E)BR$")>; + +// Load rounded +def : InstRW<[FPU_Bcyc], (instregex "LEDBR(A)?$")>; +def : InstRW<[FPU_Bplus2cyc], (instregex "LEXBR(A)?$")>; +def : InstRW<[FPU_Bplus2cyc], (instregex "LDXBR(A)?$")>; + +// Load FP integer +def : InstRW<[FPU_Bcyc], (instregex "FIEBR(A)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "FIDBR(A)?$")>; +def : InstRW<[FPU_15cyc, GroupAlone], (instregex "FIXBR(A)?$")>; + +// Store +def : 
InstRW<[FXU_3cyc], (instregex "STD(Y)?$")>; +def : InstRW<[FXU_3cyc], (instregex "STE(Y)?$")>; + +///// INLINE ASSEMBLY + +def : InstRW<[FXU_8cyc], (instregex "STCKF$")>; +def : InstRW<[FXU_15cyc], (instregex "STCK$")>; +def : InstRW<[FXU_20cyc], (instregex "STCKE$")>; +def : InstRW<[FXU], (instregex "STFLE$")>; + +///// OTHER + +// Load the Global Offset Table address +def : InstRW<[FXU], (instregex "GOT$")>; + +// Prefetch data +def : InstRW<[LSU, GroupAlone], (instregex "PFD(RL)?$")>; + +// Extract access register +def : InstRW<[LSU], (instregex "EAR$")>; + +// Insert Program Mask +def : InstRW<[FXU_3cyc, EndGroup], (instregex "IPM$")>; + +} + Index: lib/Target/SystemZ/SystemZScheduleZEC12.td =================================================================== --- /dev/null +++ lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -0,0 +1,535 @@ +//=- SystemZScheduleZEC12.td - SystemZ Scheduling Definitions --*- tblgen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for ZEC12 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def ZEC12Model : SchedMachineModel { + + let IssueWidth = 3; // 3 instructions decoded per cycle. + let MicroOpBufferSize = 40; // Issue queues + let MinLatency = 0; // Out-of-order + let LoadLatency = 1; // Optimistic load latency. + + let PostRAScheduler = 1; + + // Extra cycles for a mispredicted branch. + let MispredictPenalty = 8; + + // Max micro-ops that can be buffered for + // optimized loop dispatch/execution. + let LoopMicroOpBufferSize = 12; + + // This model does not include operand specific information. 
+ let CompleteModel = 0; +} + +let SchedModel = ZEC12Model in { + +// Execution units. BufferSize controls when scheduler will start to +// postpone scheduling of instructions using that particular unit. +def ZEC12_VBUnit : ProcResource<1>; +def ZEC12_FXUnit : ProcResource<1> { let BufferSize = 2; /* ooo */ } +def ZEC12_LSUnit : ProcResource<1> { let BufferSize = 2; /* ooo */ } +def ZEC12_FPUnit : ProcResource<1> { let BufferSize = 2; /* ooo */ } + +def : WriteRes<GroupAlone, []> { + let NumMicroOps = 0; + let BeginGroup = 1; + let EndGroup = 1; +} + +def : WriteRes<BeginGroup, []> { + let NumMicroOps = 0; + let BeginGroup = 1; +} + +def : WriteRes<EndGroup, []> { + let NumMicroOps = 0; + let EndGroup = 1; +} + +// Subtarget specific definitions of scheduling resources. + +def : WriteRes<FXU, [ZEC12_FXUnit]> { let Latency = 1; } +def : WriteRes<FXU_2cyc, [ZEC12_FXUnit]> { let Latency = 2; } +def : WriteRes<FXU_3cyc, [ZEC12_FXUnit]> { let Latency = 3; } +def : WriteRes<FXU_4cyc, [ZEC12_FXUnit]> { let Latency = 4; } +def : WriteRes<FXU_5cyc, [ZEC12_FXUnit]> { let Latency = 5; } +def : WriteRes<FXU_6cyc, [ZEC12_FXUnit]> { let Latency = 6; } +def : WriteRes<FXU_7cyc, [ZEC12_FXUnit]> { let Latency = 7; } +def : WriteRes<FXU_8cyc, [ZEC12_FXUnit]> { let Latency = 8; } +def : WriteRes<FXU_9cyc, [ZEC12_FXUnit]> { let Latency = 9; } +def : WriteRes<FXU_15cyc, [ZEC12_FXUnit]> { let Latency = 15; } +def : WriteRes<FXU_20cyc, [ZEC12_FXUnit]> { let Latency = 20; } +def : WriteRes<FXU_30cyc, [ZEC12_FXUnit]> { let Latency = 30; } + +def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 1; } +def : WriteRes<LSU_2cyc, [ZEC12_LSUnit]> { let Latency = 2; } +def : WriteRes<LSU_5cyc, [ZEC12_LSUnit]> { let Latency = 5; } +def : WriteRes<LSU_6cyc, [ZEC12_LSUnit]> { let Latency = 6; } +def : WriteRes<LSU_20cyc, [ZEC12_LSUnit]> { let Latency = 20; } +def : WriteRes<LSU_30cyc, [ZEC12_LSUnit]> { let Latency = 30; } + +def : WriteRes<FPU_Bcyc, [ZEC12_FPUnit]> { let Latency = 8; } +def : WriteRes<FPU_Bplus1cyc, [ZEC12_FPUnit]> { let Latency = 9; } +def : WriteRes<FPU_Bplus2cyc, [ZEC12_FPUnit]> { let Latency = 10; } +def : WriteRes<FPU_15cyc, [ZEC12_FPUnit]> { let Latency = 15; } +def : WriteRes<FPU_20cyc, [ZEC12_FPUnit]> { let Latency = 20; } +def : WriteRes<FPU_30cyc, [ZEC12_FPUnit]> { let Latency = 30; } + +def : WriteRes<VBU, [ZEC12_VBUnit]>; + +// -------------------------- INSTRUCTIONS ---------------------------------- // + +// InstRW constructs have been used in order to preserve the +// readability of the InstrInfo files. + +// For each instruction, as matched by a regexp, provide a list of +// resources that it needs. These will be combined into a SchedClass. 
+ +// Call +def : InstRW<[VBU, FXU_2cyc, FXU, GroupAlone], (instregex "BRAS$")>; +def : InstRW<[FXU_2cyc, FXU, LSU, GroupAlone], (instregex "(Call)?BASR$")>; +def : InstRW<[LSU, EndGroup], (instregex "CallBR$")>; +def : InstRW<[LSU, FXU_2cyc, FXU, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[LSU, FXU_2cyc, FXU, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + + +// Return +def : InstRW<[LSU, EndGroup], (instregex "Return$")>; + +// Serialize +def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>; + +///// FIXED POINT + +// Addition +def : InstRW<[FXU], (instregex "A(Y|IH|SI)?$")>; +def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; +def : InstRW<[FXU], (instregex "AG(SI)?$")>; +def : InstRW<[FXU], (instregex "AGFI$")>; +def : InstRW<[FXU], (instregex "AGHI(K)?$")>; +def : InstRW<[FXU], (instregex "AGR(K)?$")>; +def : InstRW<[FXU], (instregex "AHI(K)?$")>; +def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; +def : InstRW<[FXU], (instregex "AL(Y|FI|HSIK)?$")>; +def : InstRW<[FXU], (instregex "ALG(HSIK)?$")>; +def : InstRW<[FXU], (instregex "ALGF(I|R)?$")>; +def : InstRW<[FXU], (instregex "ALGR(K)?$")>; +def : InstRW<[FXU], (instregex "ALR(K)?$")>; +def : InstRW<[FXU], (instregex "AR(K)?$")>; + +// Logical addition with carry +def : InstRW<[FXU_3cyc, GroupAlone], (instregex "ALC(R)?$")>; +def : InstRW<[FXU_3cyc, GroupAlone], (instregex "ALCG(R)?$")>; + +// Add with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc], (instregex "AGF(R)?$")>; + +// Add halfword +def : InstRW<[FXU_2cyc], (instregex "AH(Y)?$")>; + +// Subtraction +def : InstRW<[FXU], (instregex "S(G|Y)?$")>; +def : InstRW<[FXU], (instregex "SGR(K)?$")>; +def : InstRW<[FXU], (instregex "SL(G|Y|FI)?$")>; +def : InstRW<[FXU], (instregex "SLGF(I|R)?$")>; +def : InstRW<[FXU], (instregex "SLGR(K)?$")>; +def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; +def : InstRW<[FXU], (instregex "SLR(K)?$")>; +def : InstRW<[FXU], (instregex "SR(K)?$")>; +def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; +def : 
InstRW<[FXU], (instregex "SRL(G|K)?$")>; + +// Subtraction with borrow +def : InstRW<[FXU_3cyc, GroupAlone], (instregex "SLB(G|R)?$")>; +def : InstRW<[FXU_3cyc, GroupAlone], (instregex "SLBGR$")>; + +// Subtraction with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc], (instregex "SGF(R)?$")>; + +// Subtract halfword +def : InstRW<[FXU_2cyc], (instregex "SH(Y)?$")>; + +// Multiply +def : InstRW<[FXU_6cyc], (instregex "MS(R|Y|FI)?$")>; +def : InstRW<[FXU_8cyc], (instregex "MSG(R)?$")>; +def : InstRW<[FXU_6cyc], (instregex "MSGF(I|R)?$")>; +def : InstRW<[FXU_9cyc, GroupAlone], (instregex "MLG(R)?$")>; +def : InstRW<[FXU_5cyc], (instregex "MGHI$")>; +def : InstRW<[FXU_5cyc], (instregex "MH(I|Y)?$")>; + +// Divide +def : InstRW<[FPU_30cyc, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DSG(F)?R$")>; +def : InstRW<[FPU_30cyc, LSU, FXU, FXU, FXU, GroupAlone], + (instregex "DSG(F)?$")>; +def : InstRW<[FPU_15cyc, FXU, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DLR$")>; +def : InstRW<[FPU_30cyc, FXU, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DLGR$")>; +def : InstRW<[FPU_15cyc, LSU, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DL$")>; +def : InstRW<[FPU_30cyc, LSU, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "DLG$")>; + +// And +def : InstRW<[FXU], (instregex "N(G|Y|TSTG)?$")>; +def : InstRW<[FXU], (instregex "NGR(K)?$")>; +def : InstRW<[FXU], (instregex "NI(Y|FMux|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "NIHF(64)?$")>; +def : InstRW<[FXU], (instregex "NIHH(64)?$")>; +def : InstRW<[FXU], (instregex "NIHL(64)?$")>; +def : InstRW<[FXU], (instregex "NILF(64)?$")>; +def : InstRW<[FXU], (instregex "NILH(64)?$")>; +def : InstRW<[FXU], (instregex "NILL(64)?$")>; +def : InstRW<[FXU], (instregex "NR(K)?$")>; + +// Or +def : InstRW<[FXU], (instregex "O(G|Y)?$")>; +def : InstRW<[FXU], (instregex "OGR(K)?$")>; +def : InstRW<[FXU], (instregex "OI(Y|FMux|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "OIHF(64)?$")>; +def : InstRW<[FXU], (instregex "OIHH(64)?$")>; 
+def : InstRW<[FXU], (instregex "OIHL(64)?$")>; +def : InstRW<[FXU], (instregex "OILF(64)?$")>; +def : InstRW<[FXU], (instregex "OILH(64)?$")>; +def : InstRW<[FXU], (instregex "OILL(64)?$")>; +def : InstRW<[FXU], (instregex "OR(K)?$")>; + +// Xor +def : InstRW<[FXU], (instregex "XI(Y)?$")>; +def : InstRW<[FXU], (instregex "X(G|Y|IFMux)?$")>; +def : InstRW<[FXU], (instregex "XGR(K)?$")>; +def : InstRW<[FXU], (instregex "XIHF(64)?$")>; +def : InstRW<[FXU], (instregex "XILF(64)?$")>; +def : InstRW<[FXU], (instregex "XR(K)?$")>; + +// Insert +def : InstRW<[FXU], (instregex "IC(Y)?$")>; +def : InstRW<[FXU], (instregex "IC32(Y)?$")>; +def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[FXU], (instregex "IIHF(64)?$")>; +def : InstRW<[FXU], (instregex "IIHH(64)?$")>; +def : InstRW<[FXU], (instregex "IIHL(64)?$")>; +def : InstRW<[FXU], (instregex "IILF(64)?$")>; +def : InstRW<[FXU], (instregex "IILH(64)?$")>; +def : InstRW<[FXU], (instregex "IILL(64)?$")>; + +// And / Or / Xor character +def : InstRW<[LSU, FXU, BeginGroup], (instregex "NC$")>; +def : InstRW<[LSU, FXU, BeginGroup], (instregex "OC$")>; +def : InstRW<[LSU, FXU, BeginGroup], (instregex "XC$")>; + +// Rotate +def : InstRW<[FXU], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[FXU], (instregex "RISBG(N|32)?$")>; +def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>; +def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[FXU], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[FXU, FXU_2cyc, GroupAlone], (instregex "R(N|O|X)SBG$")>; + +// Extend +def : InstRW<[FXU], (instregex "AEXT128_64$")>; +def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; + +// Find leftmost one +def : InstRW<[FXU_7cyc, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[FXU_3cyc], (instregex "POPCNT$")>; + +// Compare +def : InstRW<[FXU], (instregex "CG$")>; +def : InstRW<[FXU], (instregex "C(G|Y|IH|Mux)?$")>; +def : InstRW<[FXU], (instregex "CFI(Mux)?$")>; +def : 
InstRW<[FXU], (instregex "CGFI$")>; +def : InstRW<[FXU], (instregex "CGH(I|SI)$")>; +def : InstRW<[FXU], (instregex "CGR(L)?$")>; +def : InstRW<[FXU], (instregex "CH(I|F|SI)$")>; +def : InstRW<[FXU], (instregex "CL(Y|Mux|FHSI)?$")>; +def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>; +def : InstRW<[FXU], (instregex "CLG(HRL|HSI)?$")>; +def : InstRW<[FXU], (instregex "CLGF(I)?$")>; +def : InstRW<[FXU], (instregex "CLGFR(L)?$")>; +def : InstRW<[FXU], (instregex "CLGR(L)?$")>; +def : InstRW<[FXU], (instregex "CLH(F|RL|HSI)$")>; +def : InstRW<[FXU], (instregex "CLI(H|Y)?$")>; +def : InstRW<[FXU], (instregex "CLR(L)?$")>; +def : InstRW<[FXU], (instregex "CR(L)?$")>; + +// Compare halfword +def : InstRW<[FXU_2cyc], (instregex "CH(Y|RL)?$")>; +def : InstRW<[FXU_2cyc], (instregex "CGH(RL)?$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[FXU_2cyc], (instregex "CGF(R|RL)?$")>; + +// Compare and swap +def : InstRW<[FXU, FXU, BeginGroup], (instregex "CS(G|Y)?$")>; + +// Compare logical character +def : InstRW<[FXU, LSU, BeginGroup], (instregex "CLC$")>; + +// Test under mask +def : InstRW<[FXU], (instregex "TM(Y|HMux|LMux)?$")>; +def : InstRW<[FXU], (instregex "TMHH(64)?$")>; +def : InstRW<[FXU], (instregex "TMHL(64)?$")>; +def : InstRW<[FXU], (instregex "TMLH(64)?$")>; +def : InstRW<[FXU], (instregex "TMLL(64)?$")>; + +// Load and test +def : InstRW<[FXU], (instregex "LT(R)?$")>; +def : InstRW<[FXU], (instregex "LTG(R)?$")>; +def : InstRW<[FXU], (instregex "LTGF(R)?$")>; + +// Moves +def : InstRW<[FXU], (instregex "MVGHI$")>; +def : InstRW<[FXU], (instregex "MVH(I|HI)$")>; +def : InstRW<[FXU], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[LSU_2cyc, LSU, FXU, BeginGroup], (instregex "MVC$")>; + +// Pseudo -> reg move +def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>; +def : 
InstRW<[FXU], (instregex "REG_SEQUENCE$")>; +def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>; + +// Loads (LSU) +def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>; +def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSU], (instregex "LG(RL)?$")>; +def : InstRW<[LSU], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSU], (instregex "LLG(C|F|H|FRL|HRL)$")>; +def : InstRW<[LSU], (instregex "LLH(RL|Mux)?$")>; +def : InstRW<[LSU], (instregex "L(X|128)$")>; + +// Loads (FXU) +def : InstRW<[FXU], (instregex "LLCH$")>; +def : InstRW<[FXU], (instregex "LLHH$")>; +def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LLG(C|F|H)R$")>; +def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>; +def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>; +def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY +def : InstRW<[FXU], (instregex "LAA(G)?$")>; +def : InstRW<[FXU], (instregex "LAAL(G)?$")>; +def : InstRW<[FXU], (instregex "LAN(G)?$")>; +def : InstRW<[FXU], (instregex "LAO(G)?$")>; +def : InstRW<[FXU], (instregex "LAX(G)?$")>; +def : InstRW<[FXU], (instregex "LB(H|R|Mux)?$")>; +def : InstRW<[FXU], (instregex "LGR$")>; +def : InstRW<[FXU], (instregex "LGB(R)?$")>; +def : InstRW<[FXU], (instregex "LGF(I)?$")>; +def : InstRW<[FXU], (instregex "LGFR(L)?$")>; +def : InstRW<[FXU], (instregex "LGH(I)?$")>; +def : InstRW<[FXU], (instregex "LGHR(L)?$")>; +def : InstRW<[FXU], (instregex "LH(H|I|Y|Mux|IMux)?$")>; +def : InstRW<[FXU], (instregex "LHR(L)?$")>; +def : InstRW<[FXU], (instregex "LR(Mux)?$")>; +def : InstRW<[FXU], (instregex "LRV(R)?$")>; +def : InstRW<[FXU], (instregex "LRVG(R)?$")>; + +// Load GR from FPR +def : InstRW<[FXU_3cyc], (instregex "LGDR$")>; + +// Load multiple (estimated average of 5 ops) +def : InstRW<[LSU_5cyc, LSU, LSU, LSU, LSU, GroupAlone], (instregex "LMG$")>; + +// Load Complement / Negative / Positive +def : 
InstRW<[FXU], (instregex "LC(R|GR)$")>; +def : InstRW<[FXU_2cyc], (instregex "LN(R|GR)$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "LCGFR$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "L(N|P)GFR$")>; +def : InstRW<[FXU_2cyc], (instregex "LP(G)?R$")>; + +// Load on condition +def : InstRW<[FXU_2cyc, EndGroup], (instregex "LOC(R)?$")>; +def : InstRW<[FXU_2cyc, EndGroup], (instregex "LOCG(R)?$")>; + +// Stores +def : InstRW<[FXU], (instregex "STG(RL)?$")>; +def : InstRW<[FXU], (instregex "ST(X|128)$")>; +def : InstRW<[FXU], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU], (instregex "ST(Y|FH|RL|Mux)?$")>; +def : InstRW<[FXU], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[FXU], (instregex "STRV(G)?$")>; + +// Store on condition / CondStore pseudos +def : InstRW<[FXU, EndGroup], (instregex "STOC(G)?$")>; +def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>; +def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>; + +// Store multiple (estimated average of 5 ops) +def : InstRW<[LSU, LSU, FXU_5cyc, FXU, FXU, FXU, FXU, + GroupAlone], (instregex "STMG$")>; + +// Select pseudo +def : InstRW<[FXU], (instregex "Select(32|64|F32|F64|F128|32Mux)$")>; + +// String instructions +def : InstRW<[FXU_30cyc], (instregex "SRST$")>; +def : InstRW<[LSU_30cyc, GroupAlone], (instregex "MVST$")>; +def : InstRW<[LSU_30cyc, GroupAlone], (instregex "CLST$")>; + +///// FLOATING POINT + +// Addition +def : InstRW<[FPU_Bcyc], (instregex "AEB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "ADB(R)?$")>; +def : InstRW<[FPU_20cyc, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[FPU_Bcyc], (instregex "SEB(R)?$")>; 
+def : InstRW<[FPU_Bcyc], (instregex "SDB(R)?$")>; +def : InstRW<[FPU_20cyc, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[FPU_Bcyc], (instregex "MEEB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "MDB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "MDEB(R)?$")>; +def : InstRW<[FPU_Bplus2cyc, GroupAlone], (instregex "MXDB(R)?$")>; +def : InstRW<[FPU_30cyc, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[FPU_Bcyc, GroupAlone], (instregex "MAEB(R)?$")>; +def : InstRW<[FPU_Bcyc, GroupAlone], (instregex "MSEB(R)?$")>; +def : InstRW<[FPU_Bcyc, GroupAlone], (instregex "M(A|S)DBR$")>; +def : InstRW<[FPU_Bcyc, GroupAlone], (instregex "M(A|S)DB$")>; + +// Division +def : InstRW<[FPU_30cyc], (instregex "DEB(R)?$")>; +def : InstRW<[FPU_30cyc], (instregex "DDB(R)?$")>; +def : InstRW<[FPU_30cyc, GroupAlone], (instregex "DXBR$")>; + +// Square root +def : InstRW<[FPU_30cyc], (instregex "SQEB(R)?$")>; +def : InstRW<[FPU_30cyc], (instregex "SQDB(R)?$")>; +def : InstRW<[FPU_30cyc, GroupAlone], (instregex "SQXBR$")>; + +// Convert from fixed / logical +def : InstRW<[FXU, FPU_Bcyc, GroupAlone], (instregex "CE(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, GroupAlone], (instregex "CD(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bplus2cyc, GroupAlone], (instregex "CX(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, BeginGroup], (instregex "CEL(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, BeginGroup], (instregex "CDL(F|G)BR$")>; +def : InstRW<[FXU, FPU_Bplus2cyc, GroupAlone], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[FXU, FPU_Bplus1cyc, GroupAlone], (instregex "CF(E|D|X)BR$")>; +def : InstRW<[FXU, FPU_Bplus1cyc, GroupAlone], (instregex "CG(E|D|X)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, BeginGroup], (instregex "CLF(E|D)BR$")>; +def : InstRW<[FXU, FPU_Bcyc, GroupAlone], (instregex "CLG(E|D)BR$")>; +def : InstRW<[FXU, FPU_Bplus1cyc, BeginGroup], (instregex "CL(F|G)XBR$")>; + +// Copy sign +def : InstRW<[FXU, FXU_3cyc, 
BeginGroup], (instregex "CPSDRd(d|s)$")>; +def : InstRW<[FXU, FXU_3cyc, BeginGroup], (instregex "CPSDRs(d|s)$")>; + +// Compare +def : InstRW<[FPU_Bcyc], (instregex "CEB(R)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "CDB(R)?$")>; +def : InstRW<[FPU_30cyc], (instregex "CXBR$")>; + +// Load and Test +def : InstRW<[FPU_Bcyc], (instregex "LT(D|E)BR$")>; +def : InstRW<[FPU_Bcyc], (instregex "LTEBRCompare(_VecPseudo)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "LTDBRCompare(_VecPseudo)?$")>; +def : InstRW<[FPU_Bplus1cyc, GroupAlone], (instregex "LTXBR$")>; +def : InstRW<[FPU_Bplus1cyc, GroupAlone], + (instregex "LTXBRCompare(_VecPseudo)?$")>; + +// Load +def : InstRW<[LSU], (instregex "LE(Y)?$")>; +def : InstRW<[FXU], (instregex "LER$")>; +def : InstRW<[FXU], (instregex "LD(R|GR)$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "LXR$")>; + +// Load zero +def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>; +def : InstRW<[FXU_2cyc, FXU, BeginGroup], (instregex "LZXR$")>; + +// Load Complement / Negative / Positive +def : InstRW<[FPU_Bcyc], (instregex "L(C|N|P)DBR$")>; +def : InstRW<[FPU_Bcyc], (instregex "L(C|N|P)EBR$")>; +def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>; +def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>; +def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>; +def : InstRW<[FPU_Bplus1cyc, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Load lengthened +def : InstRW<[FPU_Bcyc], (instregex "LDEB(R)?$")>; +def : InstRW<[FPU_Bplus2cyc, GroupAlone], (instregex "LX(D|E)B$")>; +def : InstRW<[FPU_Bplus2cyc, GroupAlone], (instregex "LX(D|E)BR$")>; + +// Load rounded +def : InstRW<[FPU_Bcyc], (instregex "LEDBR(A)?$")>; +def : InstRW<[FPU_Bplus2cyc], (instregex "LEXBR(A)?$")>; +def : InstRW<[FPU_Bplus2cyc], (instregex "LDXBR(A)?$")>; + +// Load FP integer +def : InstRW<[FPU_Bcyc], (instregex "FIEBR(A)?$")>; +def : InstRW<[FPU_Bcyc], (instregex "FIDBR(A)?$")>; +def : InstRW<[FPU_15cyc, GroupAlone], (instregex "FIXBR(A)?$")>; + +// Store +def : InstRW<[FXU_3cyc], 
(instregex "STD(Y)?$")>; +def : InstRW<[FXU_3cyc], (instregex "STE(Y)?$")>; + +///// INLINE ASSEMBLY + +def : InstRW<[FXU, LSU, BeginGroup], (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, FXU_2cyc, FXU, BeginGroup], (instregex "STCKE$")>; +def : InstRW<[FXU], (instregex "STFLE$")>; + +///// OTHER + +// Transaction begin +def : InstRW<[LSU, LSU, FXU_5cyc, FXU, FXU, FXU, FXU, GroupAlone], + (instregex "TBEGIN(C|_nofloat)?$")>; + +// Transaction end +def : InstRW<[LSU, GroupAlone], (instregex "TEND$")>; + +// Transaction abort +def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>; + +// Load the Global Offset Table address +def : InstRW<[FXU], (instregex "GOT$")>; + +// Prefetch data +def : InstRW<[LSU], (instregex "PFD(RL)?$")>; + +// Extract access register +def : InstRW<[LSU], (instregex "EAR$")>; + +// Insert Program Mask +def : InstRW<[FXU_3cyc, EndGroup], (instregex "IPM$")>; + +} + Index: lib/Target/SystemZ/SystemZSubtarget.h =================================================================== --- lib/Target/SystemZ/SystemZSubtarget.h +++ lib/Target/SystemZ/SystemZSubtarget.h @@ -73,6 +73,25 @@ return &TSInfo; } + bool isZ10() const { return getCPU().equals("z10"); } + + bool enableMachineScheduler() const override { + // Disabling mischeduler for z10 (failing test-case: fp-move-02.ll) + return (!isZ10()); + } + + // Returning true here (default) makes the DAG scheduler schedule + // for source order (if returning true above), if running MIScheduler. + bool enableMachineSchedDefaultSched() const override; + + bool keepSched_hasManyFoldable(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End) const; + + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) const override; + // This is important for reducing register pressure in vector code. 
bool useAA() const override { return true; } Index: lib/Target/SystemZ/SystemZSubtarget.cpp =================================================================== --- lib/Target/SystemZ/SystemZSubtarget.cpp +++ lib/Target/SystemZ/SystemZSubtarget.cpp @@ -10,9 +10,20 @@ #include "SystemZSubtarget.h" #include "MCTargetDesc/SystemZMCTargetDesc.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" using namespace llvm; +extern cl::opt<std::string> SchedPref; + +static cl::opt<bool> +FoldableReloadHeuristic("foldable-reloads", cl::Hidden, + cl::desc("Consider reg->memory opcodes during mischeduling"), + cl::init(true)); + #define DEBUG_TYPE "systemz-subtarget" #define GET_SUBTARGETINFO_TARGET_DESC @@ -70,3 +81,21 @@ // case isn't easy to detect. return false; } + +bool SystemZSubtarget::enableMachineSchedDefaultSched() const { + return (SchedPref=="source"); +} + +void SystemZSubtarget:: +overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) const +{ + // Bidirectional scheduling pre-ra is beneficial according to benchmarks. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + // Enable heuristic for foldable reloads, i.e. prefer to spill a + // register if it is read by an instruction that can fold the reload. 
+ Policy.FoldableReloadHeuristic = FoldableReloadHeuristic; +} Index: lib/Target/SystemZ/SystemZTargetMachine.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetMachine.cpp +++ lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -9,10 +9,12 @@ #include "SystemZTargetMachine.h" #include "SystemZTargetTransformInfo.h" +#include "SystemZMachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Transforms/Scalar.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -104,6 +106,12 @@ return getTM(); } + ScheduleDAGInstrs * + createPostMachineScheduler(MachineSchedContext *C) const override { + return new ScheduleDAGMI(C, make_unique(C), + /*IsPostRA=*/true); + } + void addIRPasses() override; bool addInstSelector() override; void addPreSched2() override; @@ -168,12 +176,8 @@ // Do final scheduling after all other optimizations, to get an // optimal input for the decoder (branch relaxation must happen // after block placement). - if (getOptLevel() != CodeGenOpt::None) { - if (MISchedPostRA) - addPass(&PostMachineSchedulerID); - else - addPass(&PostRASchedulerID); - } + if (getOptLevel() != CodeGenOpt::None) + addPass(&PostMachineSchedulerID); } TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { Index: test/CodeGen/SystemZ/alias-01.ll =================================================================== --- test/CodeGen/SystemZ/alias-01.ll +++ test/CodeGen/SystemZ/alias-01.ll @@ -1,4 +1,5 @@ -; Test 32-bit ANDs in which the second operand is variable. +; Check that spilling is not needed with 16 independent +; load-add-stores. 
; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s Index: test/CodeGen/SystemZ/alloca-01.ll =================================================================== --- test/CodeGen/SystemZ/alloca-01.ll +++ test/CodeGen/SystemZ/alloca-01.ll @@ -29,12 +29,14 @@ ; CHECK: lgr %r15, [[REG2]] ; ; CHECK-A-LABEL: f1: -; CHECK-A: lgr %r15, %r1 -; CHECK-A: la %r2, 176(%r1) +; CHECK-A-DAG: lgr %r15, %r1 +; CHECK-A-DAG: la %r2, 176(%r1) +; CHECK: br %r14 ; ; CHECK-B-LABEL: f1: ; CHECK-B: lgr %r15, %r1 -; CHECK-B: la %r3, 177(%r1) +; CHECK-B: la %r0, 177(%r1) +; CHECK-B: lgr %r3, %r0 ; ; CHECK-C-LABEL: f1: ; CHECK-C: lgr %r15, %r1 Index: test/CodeGen/SystemZ/alloca-02.ll =================================================================== --- test/CodeGen/SystemZ/alloca-02.ll +++ test/CodeGen/SystemZ/alloca-02.ll @@ -10,24 +10,24 @@ define i64 @f1(i64 %length, i64 %index) { ; CHECK-A-LABEL: f1: -; CHECK-A: lgr %r15, [[ADDR:%r[1-5]]] -; CHECK-A: la %r2, 160([[ADDR]]) +; CHECK-A-DAG: lgr %r15, [[ADDR:%r[1-5]]] +; CHECK-A-DAG: la %r2, 160([[ADDR]]) ; CHECK-A: mvi 0(%r2), 0 ; ; CHECK-B-LABEL: f1: -; CHECK-B: lgr %r15, [[ADDR:%r[1-5]]] -; CHECK-B: la %r2, 160([[ADDR]]) +; CHECK-B-DAG: lgr %r15, [[ADDR:%r[1-5]]] +; CHECK-B-DAG: la %r2, 160([[ADDR]]) ; CHECK-B: mvi 4095(%r2), 1 ; ; CHECK-C-LABEL: f1: -; CHECK-C: lgr %r15, [[ADDR:%r[1-5]]] -; CHECK-C-DAG: la %r2, 160([[ADDR]]) +; CHECK-C-DAG: la %r2, 160([[ADDR:%r[1-5]]]) +; CHECK-C-DAG: lgr %r15, [[ADDR]] ; CHECK-C-DAG: lhi [[TMP:%r[0-5]]], 2 ; CHECK-C: stc [[TMP]], 0({{%r3,%r2|%r2,%r3}}) ; ; CHECK-D-LABEL: f1: -; CHECK-D: lgr %r15, [[ADDR:%r[1-5]]] -; CHECK-D-DAG: la %r2, 160([[ADDR]]) +; CHECK-D-DAG: la %r2, 160([[ADDR:%r[1-5]]]) +; CHECK-D-DAG: lgr %r15, [[ADDR]] ; CHECK-D-DAG: lhi [[TMP:%r[0-5]]], 3 ; CHECK-D: stc [[TMP]], 4095({{%r3,%r2|%r2,%r3}}) ; Index: test/CodeGen/SystemZ/alloca-03.ll =================================================================== --- test/CodeGen/SystemZ/alloca-03.ll +++ 
test/CodeGen/SystemZ/alloca-03.ll @@ -15,13 +15,14 @@ ; Allocate %len * 8, no need to align stack. define void @f1(i64 %len) { ; CHECK-LABEL: f1: -; CHECK: sllg %r0, %r2, 3 -; CHECK: lgr %r1, %r15 +; CHECK-DAG: sllg %r0, %r2, 3 +; CHECK-DAG: lgr %r1, %r15 ; CHECK: sgr %r1, %r0 ; CHECK-NOT: ngr -; CHECK: lgr %r15, %r1 -; CHECK: la %r1, 160(%r1) -; CHECK: mvghi 0(%r1), 10 +; CHECK-DAG: lgr %r15, %r1 +; CHECK-DAG: la [[ADDR:%r[1-2]]], 160(%r1) +; CHECK-DAG: mvghi 0([[ADDR]]), 10 +; CHECK: br %r14 %x = alloca i64, i64 %len store volatile i64 10, i64* %x ret void @@ -31,10 +32,11 @@ define void @f2() { ; CHECK-LABEL: f2: ; CHECK: aghi %r1, -128 -; CHECK: lgr %r15, %r1 -; CHECK: la %r1, 280(%r1) -; CHECK: nill %r1, 65408 -; CHECK: mvghi 0(%r1), 10 +; CHECK-DAG: lgr %r15, %r1 +; CHECK-DAG: la [[ADDR:%r[1-2]]], 280(%r1) +; CHECK-DAG: nill [[ADDR]], 65408 +; CHECK-DAG: mvghi 0([[ADDR]]), 10 +; CHECK: br %r14 %x = alloca i64, i64 1, align 128 store volatile i64 10, i64* %x, align 128 ret void @@ -43,14 +45,14 @@ ; Dynamic alloca, align 128. 
define void @f3(i64 %len) { ; CHECK-LABEL: f3: -; CHECK: sllg %r1, %r2, 3 -; CHECK: la %r0, 120(%r1) -; CHECK: lgr %r1, %r15 +; CHECK-DAG: sllg [[ADDR:%r[1-2]]], %r2, 3 +; CHECK-DAG: la %r0, 120([[ADDR]]) +; CHECK-DAG: lgr %r1, %r15 ; CHECK: sgr %r1, %r0 -; CHECK: lgr %r15, %r1 -; CHECK: la %r1, 280(%r1) -; CHECK: nill %r1, 65408 -; CHECK: mvghi 0(%r1), 10 +; CHECK-DAG: lgr %r15, %r1 +; CHECK-DAG: la [[ADDR:%r[1-2]]], 280(%r1) +; CHECK-DAG: nill [[ADDR]], 65408 +; CHECK: mvghi 0([[ADDR]]), 10 %x = alloca i64, i64 %len, align 128 store volatile i64 10, i64* %x, align 128 ret void @@ -73,10 +75,10 @@ ; CHECK: lgr %r1, %r15 ; CHECK: aghi %r1, -128 -; CHECK: lgr %r15, %r1 -; CHECK: la %r1, 280(%r1) -; CHECK: nill %r1, 65408 -; CHECK: mvhi 0(%r1), 10 +; CHECK-DAG: lgr %r15, %r1 +; CHECK-DAG: la [[ADDR:%r[1-2]]], 280(%r1) +; CHECK-DAG: nill [[ADDR]], 65408 +; CHECK: mvhi 0([[ADDR]]), 10 %x = alloca i32, i64 1, align 128 store volatile i32 10, i32* %x ret void Index: test/CodeGen/SystemZ/args-01.ll =================================================================== --- test/CodeGen/SystemZ/args-01.ll +++ test/CodeGen/SystemZ/args-01.ll @@ -40,16 +40,16 @@ ; ; CHECK-FP128-1-LABEL: foo: ; CHECK-FP128-1: aghi %r15, -256 -; CHECK-FP128-1: lzxr %f0 -; CHECK-FP128-1-DAG: std %f0, 224(%r15) -; CHECK-FP128-1-DAG: std %f2, 232(%r15) +; CHECK-FP128-1: lzxr [[REG:%f[01]]] +; CHECK-FP128-1-DAG: std [[REG]], 224(%r15) +; CHECK-FP128-1-DAG: std %f{{2|3}}, 232(%r15) ; CHECK-FP128-1: brasl %r14, bar@PLT ; ; CHECK-FP128-2-LABEL: foo: ; CHECK-FP128-2: aghi %r15, -256 -; CHECK-FP128-2: lzxr %f0 -; CHECK-FP128-2-DAG: std %f0, 240(%r15) -; CHECK-FP128-2-DAG: std %f2, 248(%r15) +; CHECK-FP128-2: lzxr [[REG:%f[01]]] +; CHECK-FP128-2-DAG: std [[REG]], 240(%r15) +; CHECK-FP128-2-DAG: std %f{{2|3}}, 248(%r15) ; CHECK-FP128-2: brasl %r14, bar@PLT ; ; CHECK-STACK-LABEL: foo: Index: test/CodeGen/SystemZ/args-02.ll =================================================================== --- 
test/CodeGen/SystemZ/args-02.ll +++ test/CodeGen/SystemZ/args-02.ll @@ -41,16 +41,16 @@ ; ; CHECK-FP128-1-LABEL: foo: ; CHECK-FP128-1: aghi %r15, -256 -; CHECK-FP128-1: lzxr %f0 -; CHECK-FP128-1-DAG: std %f0, 224(%r15) -; CHECK-FP128-1-DAG: std %f2, 232(%r15) +; CHECK-FP128-1: lzxr [[REG:%f[01]]] +; CHECK-FP128-1-DAG: std [[REG]], 224(%r15) +; CHECK-FP128-1-DAG: std %f{{2|3}}, 232(%r15) ; CHECK-FP128-1: brasl %r14, bar@PLT ; ; CHECK-FP128-2-LABEL: foo: ; CHECK-FP128-2: aghi %r15, -256 -; CHECK-FP128-2: lzxr %f0 -; CHECK-FP128-2-DAG: std %f0, 240(%r15) -; CHECK-FP128-2-DAG: std %f2, 248(%r15) +; CHECK-FP128-2: lzxr [[REG:%f[01]]] +; CHECK-FP128-2-DAG: std [[REG]], 240(%r15) +; CHECK-FP128-2-DAG: std %f{{2|3}}, 248(%r15) ; CHECK-FP128-2: brasl %r14, bar@PLT ; ; CHECK-STACK-LABEL: foo: Index: test/CodeGen/SystemZ/args-03.ll =================================================================== --- test/CodeGen/SystemZ/args-03.ll +++ test/CodeGen/SystemZ/args-03.ll @@ -41,25 +41,25 @@ ; ; CHECK-FP128-1-LABEL: foo: ; CHECK-FP128-1: aghi %r15, -256 -; CHECK-FP128-1: lzxr %f0 -; CHECK-FP128-1-DAG: std %f0, 224(%r15) -; CHECK-FP128-1-DAG: std %f2, 232(%r15) +; CHECK-FP128-1: lzxr [[REG:%f[01]]] +; CHECK-FP128-1-DAG: std [[REG]], 224(%r15) +; CHECK-FP128-1-DAG: std %f{{2|3}}, 232(%r15) ; CHECK-FP128-1: brasl %r14, bar@PLT ; ; CHECK-FP128-2-LABEL: foo: ; CHECK-FP128-2: aghi %r15, -256 -; CHECK-FP128-2: lzxr %f0 -; CHECK-FP128-2-DAG: std %f0, 240(%r15) -; CHECK-FP128-2-DAG: std %f2, 248(%r15) +; CHECK-FP128-2: lzxr [[REG:%f[01]]] +; CHECK-FP128-2-DAG: std [[REG]], 240(%r15) +; CHECK-FP128-2-DAG: std %f{{2|3}}, 248(%r15) ; CHECK-FP128-2: brasl %r14, bar@PLT ; ; CHECK-STACK-LABEL: foo: -; CHECK-STACK: aghi %r15, -256 -; CHECK-STACK: la [[REGISTER:%r[0-5]+]], {{224|240}}(%r15) -; CHECK-STACK: stg [[REGISTER]], 216(%r15) -; CHECK-STACK: llilf [[AT184:%r[0-5]+]], 4294967288 -; CHECK-STACK: stg [[AT184]], 184(%r15) -; CHECK-STACK: llill [[AT176:%r[0-5]+]], 65529 +; CHECK-STACK-DAG: 
aghi %r15, -256 +; CHECK-STACK-DAG: la [[REGISTER:%r[0-5]+]], {{224|240}}(%r15) +; CHECK-STACK-DAG: stg [[REGISTER]], 216(%r15) +; CHECK-STACK-DAG: llilf [[AT184:%r[0-5]+]], 4294967288 +; CHECK-STACK-DAG: stg [[AT184]], 184(%r15) +; CHECK-STACK-DAG: llill [[AT176:%r[0-5]+]], 65529 ; CHECK-STACK: stg [[AT176]], 176(%r15) ; CHECK-STACK: mvghi 208(%r15), 0 ; CHECK-STACK: mvhi 204(%r15), 0 Index: test/CodeGen/SystemZ/args-06.ll =================================================================== --- test/CodeGen/SystemZ/args-06.ll +++ test/CodeGen/SystemZ/args-06.ll @@ -5,12 +5,13 @@ define i8 @f1(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g) { ; CHECK-LABEL: f1: -; CHECK: ar %r2, %r3 -; CHECK: ar %r2, %r4 -; CHECK: ar %r2, %r5 -; CHECK: ar %r2, %r6 -; CHECK: lb {{%r[0-5]}}, 167(%r15) -; CHECK: lb {{%r[0-5]}}, 175(%r15) +; CHECK-DAG: lb [[REG0:%r[0-5]]], 167(%r15) +; CHECK-DAG: lb [[REG1:%r[0-5]]], 175(%r15) +; CHECK-DAG: ar %r2, [[REG0]] +; CHECK-DAG: ar %r2, [[REG1]] +; CHECK-DAG: ar %r2, %r4 +; CHECK-DAG: ar %r2, %r5 +; CHECK-DAG: ar %r2, %r6 ; CHECK: br %r14 %addb = add i8 %a, %b %addc = add i8 %addb, %c Index: test/CodeGen/SystemZ/atomicrmw-minmax-03.ll =================================================================== --- test/CodeGen/SystemZ/atomicrmw-minmax-03.ll +++ test/CodeGen/SystemZ/atomicrmw-minmax-03.ll @@ -158,8 +158,8 @@ ; Check that constants are handled. define i32 @f13(i32 %dummy, i32 *%ptr) { ; CHECK-LABEL: f13: -; CHECK: lhi [[LIMIT:%r[0-9]+]], 42 -; CHECK: l %r2, 0(%r3) +; CHECK-DAG: lhi [[LIMIT:%r[0-9]+]], 42 +; CHECK-DAG: l %r2, 0(%r3) ; CHECK: [[LOOP:\.[^:]*]]: ; CHECK: lr [[NEW:%r[0-9]+]], %r2 ; CHECK: crjle %r2, [[LIMIT]], [[KEEP:\..*]] Index: test/CodeGen/SystemZ/atomicrmw-minmax-04.ll =================================================================== --- test/CodeGen/SystemZ/atomicrmw-minmax-04.ll +++ test/CodeGen/SystemZ/atomicrmw-minmax-04.ll @@ -125,8 +125,8 @@ ; Check that constants are handled. 
define i64 @f10(i64 %dummy, i64 *%ptr) { ; CHECK-LABEL: f10: -; CHECK: lghi [[LIMIT:%r[0-9]+]], 42 -; CHECK: lg %r2, 0(%r3) +; CHECK-DAG: lghi [[LIMIT:%r[0-9]+]], 42 +; CHECK-DAG: lg %r2, 0(%r3) ; CHECK: [[LOOP:\.[^:]*]]: ; CHECK: lgr [[NEW:%r[0-9]+]], %r2 ; CHECK: cgrjle %r2, [[LIMIT]], [[KEEP:\..*]] Index: test/CodeGen/SystemZ/atomicrmw-xchg-03.ll =================================================================== --- test/CodeGen/SystemZ/atomicrmw-xchg-03.ll +++ test/CodeGen/SystemZ/atomicrmw-xchg-03.ll @@ -110,8 +110,8 @@ ; use the sequence above. define i32 @f10(i32 %dummy, i32 *%src) { ; CHECK-LABEL: f10: -; CHECK: llill [[VALUE:%r[0-9+]]], 40000 -; CHECK: l %r2, 0(%r3) +; CHECK-DAG: llill [[VALUE:%r[0-9]+]], 40000 +; CHECK-DAG: l %r2, 0(%r3) ; CHECK: [[LABEL:\.[^:]*]]: ; CHECK: cs %r2, [[VALUE]], 0(%r3) ; CHECK: jl [[LABEL]] Index: test/CodeGen/SystemZ/atomicrmw-xchg-04.ll =================================================================== --- test/CodeGen/SystemZ/atomicrmw-xchg-04.ll +++ test/CodeGen/SystemZ/atomicrmw-xchg-04.ll @@ -77,8 +77,8 @@ ; use the sequence above. 
define i64 @f7(i64 %dummy, i64 *%ptr) { ; CHECK-LABEL: f7: -; CHECK: llilf [[VALUE:%r[0-9+]]], 3000000000 -; CHECK: lg %r2, 0(%r3) +; CHECK-DAG: llilf [[VALUE:%r[0-9]+]], 3000000000 +; CHECK-DAG: lg %r2, 0(%r3) ; CHECK: [[LABEL:\.[^:]*]]: ; CHECK: csg %r2, [[VALUE]], 0(%r3) ; CHECK: jl [[LABEL]] Index: test/CodeGen/SystemZ/branch-05.ll =================================================================== --- test/CodeGen/SystemZ/branch-05.ll +++ test/CodeGen/SystemZ/branch-05.ll @@ -6,9 +6,9 @@ ; CHECK-LABEL: f1: ; CHECK: ahi %r4, -1 ; CHECK: clijh %r4, 5, -; CHECK: llgfr [[OP64:%r[0-5]]], %r4 -; CHECK: sllg [[INDEX:%r[1-5]]], [[OP64]], 3 -; CHECK: larl [[BASE:%r[1-5]]] +; CHECK-DAG: llgfr [[OP64:%r[0-5]]], %r4 +; CHECK-DAG: sllg [[INDEX:%r[1-5]]], [[OP64]], 3 +; CHECK-DAG: larl [[BASE:%r[1-5]]] ; CHECK: lg [[TARGET:%r[1-5]]], 0([[BASE]],[[INDEX]]) ; CHECK: br [[TARGET]] entry: Index: test/CodeGen/SystemZ/call-03.ll =================================================================== --- test/CodeGen/SystemZ/call-03.ll +++ test/CodeGen/SystemZ/call-03.ll @@ -64,7 +64,7 @@ ; the target register is %r1. define void @f5(void(i32, i32, i32, i32) *%foo) { ; CHECK-LABEL: f5: -; CHECK: lgr %r1, %r2 +; CHECK: lgr %r{{[0-1]}}, %r2 ; CHECK-DAG: lhi %r2, 1 ; CHECK-DAG: lhi %r3, 2 ; CHECK-DAG: lhi %r4, 3 Index: test/CodeGen/SystemZ/fp-add-03.ll =================================================================== --- test/CodeGen/SystemZ/fp-add-03.ll +++ test/CodeGen/SystemZ/fp-add-03.ll @@ -5,12 +5,12 @@ ; There is no memory form of 128-bit addition. 
define void @f1(fp128 *%ptr, float %f2) { ; CHECK-LABEL: f1: -; CHECK: lxebr %f0, %f0 -; CHECK: ld %f1, 0(%r2) -; CHECK: ld %f3, 8(%r2) -; CHECK: axbr %f1, %f0 -; CHECK: std %f1, 0(%r2) -; CHECK: std %f3, 8(%r2) +; CHECK-DAG: lxebr %f0, %f0 +; CHECK-DAG: ld %f1, 0(%r2) +; CHECK-DAG: ld %f3, 8(%r2) +; CHECK: axbr [[REGISTER:%f[0-1]+]], %f{{0|1}} +; CHECK: std [[REGISTER]], 0(%r2) +; CHECK: std %f{{2|3}}, 8(%r2) ; CHECK: br %r14 %f1 = load fp128 , fp128 *%ptr %f2x = fpext float %f2 to fp128 Index: test/CodeGen/SystemZ/fp-cmp-03.ll =================================================================== --- test/CodeGen/SystemZ/fp-cmp-03.ll +++ test/CodeGen/SystemZ/fp-cmp-03.ll @@ -6,9 +6,9 @@ ; There is no memory form of 128-bit comparison. define i64 @f1(i64 %a, i64 %b, fp128 *%ptr, float %f2) { ; CHECK-LABEL: f1: -; CHECK: lxebr %f0, %f0 -; CHECK: ld %f1, 0(%r4) -; CHECK: ld %f3, 8(%r4) +; CHECK-DAG: lxebr %f0, %f0 +; CHECK-DAG: ld %f1, 0(%r4) +; CHECK-DAG: ld %f3, 8(%r4) ; CHECK: cxbr %f1, %f0 ; CHECK-NEXT: je ; CHECK: lgr %r2, %r3 Index: test/CodeGen/SystemZ/fp-cmp-04.ll =================================================================== --- test/CodeGen/SystemZ/fp-cmp-04.ll +++ test/CodeGen/SystemZ/fp-cmp-04.ll @@ -275,10 +275,10 @@ define void @f14(fp128 *%ptr1, fp128 *%ptr2) { ; CHECK-LABEL: f14: ; CHECK: ltxbr -; CHECK-NEXT: dxbr -; CHECK-NEXT: std -; CHECK-NEXT: std -; CHECK-NEXT: mxbr +; CHECK-DAG: dxbr +; CHECK-DAG: mxbr +; CHECK-DAG: std +; CHECK-DAG: std ; CHECK-NEXT: std ; CHECK-NEXT: std ; CHECK-NEXT: jl .L{{.*}} Index: test/CodeGen/SystemZ/fp-div-03.ll =================================================================== --- test/CodeGen/SystemZ/fp-div-03.ll +++ test/CodeGen/SystemZ/fp-div-03.ll @@ -5,9 +5,9 @@ ; There is no memory form of 128-bit division. 
define void @f1(fp128 *%ptr, float %f2) { ; CHECK-LABEL: f1: -; CHECK: lxebr %f0, %f0 -; CHECK: ld %f1, 0(%r2) -; CHECK: ld %f3, 8(%r2) +; CHECK-DAG: lxebr %f0, %f0 +; CHECK-DAG: ld %f1, 0(%r2) +; CHECK-DAG: ld %f3, 8(%r2) ; CHECK: dxbr %f1, %f0 ; CHECK: std %f1, 0(%r2) ; CHECK: std %f3, 8(%r2) Index: test/CodeGen/SystemZ/fp-mul-04.ll =================================================================== --- test/CodeGen/SystemZ/fp-mul-04.ll +++ test/CodeGen/SystemZ/fp-mul-04.ll @@ -108,7 +108,7 @@ define double @f7(double *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: mxdb %f0, 160(%r15) +; CHECK: mxdb %f{{0|1}}, 160(%r15) ; CHECK: br %r14 %ptr1 = getelementptr double, double *%ptr0, i64 2 %ptr2 = getelementptr double, double *%ptr0, i64 4 Index: test/CodeGen/SystemZ/fp-mul-05.ll =================================================================== --- test/CodeGen/SystemZ/fp-mul-05.ll +++ test/CodeGen/SystemZ/fp-mul-05.ll @@ -5,12 +5,12 @@ ; There is no memory form of 128-bit multiplication. define void @f1(fp128 *%ptr, float %f2) { ; CHECK-LABEL: f1: -; CHECK: lxebr %f0, %f0 -; CHECK: ld %f1, 0(%r2) -; CHECK: ld %f3, 8(%r2) -; CHECK: mxbr %f1, %f0 -; CHECK: std %f1, 0(%r2) -; CHECK: std %f3, 8(%r2) +; CHECK-DAG: lxebr %f0, %f0 +; CHECK-DAG: ld %f1, 0(%r2) +; CHECK-DAG: ld %f3, 8(%r2) +; CHECK: mxbr [[REG:%f[0-1]]], %f{{0|1}} +; CHECK: std [[REG]], 0(%r2) +; CHECK: std %f{{2|3}}, 8(%r2) ; CHECK: br %r14 %f1 = load fp128 , fp128 *%ptr %f2x = fpext float %f2 to fp128 Index: test/CodeGen/SystemZ/fp-sub-03.ll =================================================================== --- test/CodeGen/SystemZ/fp-sub-03.ll +++ test/CodeGen/SystemZ/fp-sub-03.ll @@ -5,9 +5,9 @@ ; There is no memory form of 128-bit subtraction. 
define void @f1(fp128 *%ptr, float %f2) { ; CHECK-LABEL: f1: -; CHECK: lxebr %f0, %f0 -; CHECK: ld %f1, 0(%r2) -; CHECK: ld %f3, 8(%r2) +; CHECK-DAG: lxebr %f0, %f0 +; CHECK-DAG: ld %f1, 0(%r2) +; CHECK-DAG: ld %f3, 8(%r2) ; CHECK: sxbr %f1, %f0 ; CHECK: std %f1, 0(%r2) ; CHECK: std %f3, 8(%r2) Index: test/CodeGen/SystemZ/vec-args-06.ll =================================================================== --- test/CodeGen/SystemZ/vec-args-06.ll +++ test/CodeGen/SystemZ/vec-args-06.ll @@ -42,29 +42,29 @@ ; CHECK-LABEL: f2: ; CHECK: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) -; CHECK: vst [[VTMP]], 128(%r2) -; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK-DAG: vst [[VTMP]], 128(%r2) +; CHECK-DAG: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) -; CHECK: vst [[VTMP]], 112(%r2) -; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK-DAG: vst [[VTMP]], 112(%r2) +; CHECK-DAG: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) -; CHECK: vst [[VTMP]], 96(%r2) -; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK-DAG: vst [[VTMP]], 96(%r2) +; CHECK-DAG: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) -; CHECK: vst [[VTMP]], 80(%r2) -; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK-DAG: vst [[VTMP]], 80(%r2) +; CHECK-DAG: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) -; CHECK: vst [[VTMP]], 64(%r2) -; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK-DAG: vst [[VTMP]], 64(%r2) +; CHECK-DAG: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) -; CHECK: vst [[VTMP]], 48(%r2) -; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK-DAG: vst [[VTMP]], 48(%r2) +; CHECK-DAG: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) -; CHECK: vst [[VTMP]], 32(%r2) -; CHECK: larl [[TMP:%r[0-5]]], .LCPI +; CHECK-DAG: vst [[VTMP]], 32(%r2) +; CHECK-DAG: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) -; CHECK: vst [[VTMP]], 16(%r2) -; CHECK: larl 
[[TMP:%r[0-5]]], .LCPI +; CHECK-DAG: vst [[VTMP]], 16(%r2) +; CHECK-DAG: larl [[TMP:%r[0-5]]], .LCPI ; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ; CHECK: vst [[VTMP]], 0(%r2) ; CHECK: br %r14 Index: test/CodeGen/SystemZ/vec-perm-12.ll =================================================================== --- test/CodeGen/SystemZ/vec-perm-12.ll +++ test/CodeGen/SystemZ/vec-perm-12.ll @@ -7,9 +7,9 @@ define <4 x i32> @f1(<4 x i32> %x, i64 %y) { ; CHECK-CODE-LABEL: f1: -; CHECK-CODE: vlvgf [[ELT:%v[0-9]+]], %r2, 0 -; CHECK-CODE: larl [[REG:%r[0-5]]], -; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]]) +; CHECK-CODE-DAG: vlvgf [[ELT:%v[0-9]+]], %r2, 0 +; CHECK-CODE-DAG: larl [[REG:%r[0-5]]], +; CHECK-CODE-DAG: vl [[MASK:%v[0-9]+]], 0([[REG]]) ; CHECK-CODE: vperm %v24, %v24, [[ELT]], [[MASK]] ; CHECK-CODE: br %r14 Index: test/CodeGen/SystemZ/vec-perm-13.ll =================================================================== --- test/CodeGen/SystemZ/vec-perm-13.ll +++ test/CodeGen/SystemZ/vec-perm-13.ll @@ -7,9 +7,9 @@ define <4 x i16> @f1(<4 x i16> %x) { ; CHECK-CODE-LABEL: f1: -; CHECK-CODE: larl [[REG:%r[0-5]]], -; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]]) -; CHECK-CODE: vgbm [[ELT:%v[0-9]+]], 0 +; CHECK-CODE-DAG: larl [[REG:%r[0-5]]], +; CHECK-CODE-DAG: vl [[MASK:%v[0-9]+]], 0([[REG]]) +; CHECK-CODE-DAG: vgbm [[ELT:%v[0-9]+]], 0 ; CHECK-CODE: vperm %v24, %v24, [[ELT]], [[MASK]] ; CHECK-CODE: br %r14 Index: test/CodeGen/SystemZ/vec-sub-01.ll =================================================================== --- test/CodeGen/SystemZ/vec-sub-01.ll +++ test/CodeGen/SystemZ/vec-sub-01.ll @@ -38,10 +38,9 @@ ret <2 x i64> %ret } -; Test a v4f32 subtraction, as an example of an operation that needs to be -; scalarized and reassembled. At present there's an unnecessary move that -; could be avoided with smarter ordering. It also isn't important whether -; the VSLDBs use the result of the VLRs or use %v24 and %v26 directly. 
+; Test a v4f32 subtraction, as an example of an operation that needs +; to be scalarized and reassembled. It isn't important whether the +; VSLDBs use the result of the VLRs or use %v24 and %v26 directly. define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) { ; CHECK-LABEL: f5: ; CHECK-DAG: vlr %v[[A1:[0-5]]], %v24 @@ -52,12 +51,11 @@ ; CHECK-DAG: vrepf %v[[C2:[0-5]]], %v[[A2]], 2 ; CHECK-DAG: vrepf %v[[D1:[0-5]]], %v[[A1]], 3 ; CHECK-DAG: vrepf %v[[D2:[0-5]]], %v[[A2]], 3 -; CHECK-DAG: ler %f[[A1copy:[0-5]]], %f[[A1]] -; CHECK-DAG: sebr %f[[A1copy]], %f[[A2]] +; CHECK-DAG: sebr %f[[A1]], %f[[A2]] ; CHECK-DAG: sebr %f[[B1]], %f[[B2]] ; CHECK-DAG: sebr %f[[C1]], %f[[C2]] ; CHECK-DAG: sebr %f[[D1]], %f[[D2]] -; CHECK-DAG: vmrhf [[HIGH:%v[0-9]+]], %v[[A1copy]], %v[[B1]] +; CHECK-DAG: vmrhf [[HIGH:%v[0-9]+]], %v[[A1]], %v[[B1]] ; CHECK-DAG: vmrhf [[LOW:%v[0-9]+]], %v[[C1]], %v[[D1]] ; CHECK: vmrhg %v24, [[HIGH]], [[LOW]] ; CHECK: br %r14