This is an archive of the discontinued LLVM Phabricator instance.

MachineScheduler: Fully compare top/bottom candidates
ClosedPublic

Authored by MatzeB on Apr 21 2016, 7:47 PM.

Details

Summary

In bidirectional scheduling this gives more stable results than just comparing the "reason" fields of the top and bottom candidates, because the reason field can rank higher or lower depending on what other nodes happen to be in the queue.
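
To make the summary concrete, here is a minimal, hypothetical sketch of the idea (not the actual GenericScheduler code): instead of only comparing the enum "reason" each boundary reports for its winner, the two winning candidates are re-compared with the same full comparison used inside each boundary. The Candidate/Reason types and isBetter() below are illustrative stand-ins, not LLVM's real types.

```cpp
// Hypothetical, simplified types; the real scheduler's SchedCandidate and
// tryCandidate logic are considerably richer.
enum class Reason { NoCand, PhysReg, RegExcess, Cluster, Weak, NodeOrder };

struct Candidate {
  unsigned NodeId = 0;
  Reason Why = Reason::NoCand; // why this node won within its own boundary
  int PressureDelta = 0;       // simplified register-pressure change
};

// Full comparison as used within one boundary (illustrative ordering).
bool isBetter(const Candidate &A, const Candidate &B) {
  if (A.PressureDelta != B.PressureDelta)
    return A.PressureDelta < B.PressureDelta;
  return A.Why < B.Why;
}

// Old behavior: compare only Top.Why vs. Bot.Why, which depends on what else
// happened to be in each queue. New behavior: rerun the full comparison on
// the two winners so the top/bottom decision uses the same total order.
Candidate pickBidirectional(const Candidate &Top, const Candidate &Bot,
                            bool &IsTopNode) {
  IsTopNode = isBetter(Top, Bot);
  return IsTopNode ? Top : Bot;
}
```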

Diff Detail

Repository
rL LLVM

Event Timeline

MatzeB updated this revision to Diff 54605.Apr 21 2016, 7:47 PM
MatzeB retitled this revision from to MachineScheduler: Fully compare top/bottom candidates.
MatzeB updated this object.
MatzeB added a reviewer: atrick.
MatzeB set the repository for this revision to rL LLVM.
MatzeB added a subscriber: llvm-commits.

Note that this is an early review. While the change improves many situations, and a number of the test cases I fixed even show fewer spills/reloads than before, I cannot commit this yet, because we systematically get many AMDGPU tests "wrong": latency in the bottom schedule boundary trumps everything else (the nodes are in the pending queue and not subject to tryCandidate). This leads to unnecessary violations of weak edges where one of the two linked nodes is still in the pending queue. Previously we would accidentally get this right because we would schedule the weak edges correctly from the top...

atrick edited edge metadata.Apr 21 2016, 8:50 PM

I think this basic idea makes a lot of sense. Good work.

I'm going to test this patch to get some register usage statistics, and I'll see if I can fix the regressing tests.

test/CodeGen/AMDGPU/ds_read2_offset_order.ll
11–13 ↗(On Diff #54605)

This change doesn't look right to me. I need to look at this test more closely.

test/CodeGen/AMDGPU/s_addk_i32.ll
6–7 ↗(On Diff #54605)

This is a regression.

31–50 ↗(On Diff #54605)

These are all regressions.

test/CodeGen/AMDGPU/shl_add_constant.ll
61–62 ↗(On Diff #54605)

This is an improvement.

77–78 ↗(On Diff #54605)

This is a regression.

I would like to move this patch forward. I analyzed some of the AMDGPU regressions and they are accidentally caused by shortcomings in the MachineScheduler ReadyCycle computation, which calculates suboptimal values when an instruction requires 0 cycles. Fixing this properly is for another day, but the obvious fix for AMDGPU is not defining COPY as 0 cycles (which most likely was not intentional anyway). Tom or Matt, could you take a look at http://reviews.llvm.org/D21540?

MatzeB added inline comments.Jun 20 2016, 5:07 PM
test/CodeGen/AMDGPU/s_addk_i32.ll
6–7 ↗(On Diff #54605)

This is avoided with http://reviews.llvm.org/D21540 applied.

31–50 ↗(On Diff #54605)

No regression with http://reviews.llvm.org/D21540 applied.

test/CodeGen/AMDGPU/shl_add_constant.ll
77–78 ↗(On Diff #54605)

Are you sure this is actually a regression? The different ordering in the function also gives me:
-; NumSgprs: 8
+; NumSgprs: 6
when comparing the previous and new versions!

MatzeB updated this revision to Diff 61498.Jun 21 2016, 8:51 PM
MatzeB edited edge metadata.

Updated the testcases for the new http://reviews.llvm.org/D21540.

I would like to take a quick look at the performance stats with these updates before this is committed. This shouldn't take me too long.

test/CodeGen/AMDGPU/shl_add_constant.ll
77–78 ↗(On Diff #54605)

Using 0-48 SGPRs allows for maximum occupancy, so saving 2 SGPRs doesn't impact performance here.

I think this is probably uncovering a bug somewhere else in the backend.

> Using 0-48 SGPRs allows for maximum occupancy, so saving 2 SGPRs doesn't impact performance here.

As far as the scheduling algorithms are concerned, fewer registers are better; I don't think the heuristic knows about the threshold at 48 registers.

MatzeB updated this revision to Diff 61631.Jun 22 2016, 6:45 PM

Tweaked the heuristic to get the reg-usage.ll test that Tom provided under control. The problem exposed by that case arose in situations where both the top and bottom boundary only had pressure-increasing choices available. Comparing the magnitudes and then scheduling some top and some bottom nodes was a bad idea:
in some situations we would schedule a top node early (because it happened to increase pressure the least across both boundaries), only to find out a few nodes later that it would have decreased the pressure at the bottom. I tweaked the heuristic to always pick a bottom node in cases where all available choices at the top and bottom increase register pressure.
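
A hedged sketch of that tweak, under the assumption that each boundary exposes its ready queue with a per-candidate pressure delta; the names below are illustrative stand-ins, not the real MachineScheduler API:

```cpp
#include <algorithm>
#include <vector>

// Hypothetical candidate record; PressureDelta > 0 means scheduling the node
// at its boundary increases register pressure.
struct Candidate {
  unsigned NodeId = 0;
  int PressureDelta = 0;
};

bool allIncreasePressure(const std::vector<Candidate> &Queue) {
  return std::all_of(Queue.begin(), Queue.end(),
                     [](const Candidate &C) { return C.PressureDelta > 0; });
}

// When every available choice at both boundaries increases pressure, prefer
// the bottom candidate instead of comparing magnitudes across boundaries:
// a "least bad" top pick can pin an instruction that would later have
// reduced pressure at the bottom.
bool pickBottomWhenAllChoicesAreBad(const std::vector<Candidate> &TopQueue,
                                    const std::vector<Candidate> &BotQueue) {
  return allIncreasePressure(TopQueue) && allIncreasePressure(BotQueue);
}
```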

In reg-usage.ll I no longer see a regression in NumVgprs. There is still a 38 -> 46 regression for SGPRs which I will investigate, but it would probably be worth re-running the benchmarks with this updated version.

MatzeB updated this revision to Diff 61825.Jun 24 2016, 1:32 PM

Investigating the scheduler some more, I came to the realization that when scheduling bidirectionally we often have situations in which there is no clear good choice at the top or bottom boundary. Picking something from the top boundary because some heuristic rule of a "tie-breaking nature" tells us to is often counterproductive, as we often get into a situation later where the node would have been useful to reduce register pressure at the bottom boundary.

I tweaked the heuristic so that we only pick something from the top boundary if it is a good choice in itself (= only one choice, register pressure improves, cluster edges are respected, physreg copies are kept early). In cases where we only have "tie-breaking" rules (= register pressure is increased less than with another node, weak edges are not fulfilled, nodes are in program order, ...), we stay with the choice from the bottom boundary.
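
The distinction between reasons that are good "in themselves" and mere tie-breakers could look roughly like the following sketch; the Reason values and helper name are hypothetical, not the actual GenericScheduler enum:

```cpp
// Hypothetical reasons a candidate can win within its boundary; the real
// GenericScheduler reasons differ.
enum class Reason { OnlyChoice, PhysRegCopy, RegPressureDecrease, Cluster,
                    SmallerPressureIncrease, Weak, NodeOrder };

// A top-boundary pick only overrides the bottom choice when its winning
// reason is good on its own merits; tie-breakers leave the decision with
// the bottom boundary.
bool topWinOverridesBottom(Reason TopReason) {
  switch (TopReason) {
  case Reason::OnlyChoice:          // only one node was available
  case Reason::PhysRegCopy:         // keep physreg copies early
  case Reason::RegPressureDecrease: // pressure actually improves
  case Reason::Cluster:             // respect cluster edges
    return true;
  default:                          // smaller increase, weak edges, order...
    return false;
  }
}
```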

This restores the quality of the reg-usage.ll example from Tom, and seems to be beneficial in general. On AArch64 (the only other target with bidirectional scheduling enabled) I measure a 2% improvement in 252.eon, 1% in 401.bzip2, and improvements in a handful of smaller benchmarks without any regressions (outside of noise).

The updated branch improves register usage when compared to trunk:

SGPRS: 318600 -> 313856 (-1.49 %)
VGPRS: 206385 -> 203085 (-1.60 %)
Code Size: 7199132 -> 7191340 (-0.11 %) bytes
Max Waves: 49246 -> 49626 (0.77 %)

I have no objections to this change, thanks for working on this.

This revision was automatically updated to reflect the committed changes.