Diff 527850

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Show First 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	enum class SchedGroupMask {
DS = 1u << 7,		DS = 1u << 7,
DS_READ = 1u << 8,		DS_READ = 1u << 8,
DS_WRITE = 1u << 9,		DS_WRITE = 1u << 9,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |		ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
DS_READ | DS_WRITE,		DS_READ | DS_WRITE,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)		LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};		};

		class SchedGroup;

typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;		typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;

		typedef function_ref<bool(const SUnit *, const ArrayRef<SUnit *>,
		const SIInstrInfo *, SmallVectorImpl<SchedGroup> &,
		unsigned)>
		InstructionRuleType;

// Classify instructions into groups to enable fine tuned control over the		// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel		// scheduler. These groups may be more specific than current SchedModel
// instruction classes.		// instruction classes.
class SchedGroup {		class SchedGroup {
private:		private:
// Mask that defines which instruction types can be classified into this		// Mask that defines which instruction types can be classified into this
// SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER		// SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
// and SCHED_GROUP_BARRIER.		// and SCHED_GROUP_BARRIER.
SchedGroupMask SGMask;		SchedGroupMask SGMask;

// Maximum number of SUnits that can be added to this group.		// Maximum number of SUnits that can be added to this group.
std::optional<unsigned> MaxSize;		std::optional<unsigned> MaxSize;

// SchedGroups will only synchronize with other SchedGroups that have the same		// SchedGroups will only synchronize with other SchedGroups that have the same
// SyncID.		// SyncID.
int SyncID = 0;		int SyncID = 0;

// SGID is used to map instructions to candidate SchedGroups		// SGID is used to map instructions to candidate SchedGroups
unsigned SGID;		unsigned SGID;

		// The different rules each instruction in this SchedGroup must conform to
		std::optional<SmallVector<InstructionRuleType, 4>> Rules;

// Count of the number of created SchedGroups, used to initialize SGID.		// Count of the number of created SchedGroups, used to initialize SGID.
static unsigned NumSchedGroups;		static unsigned NumSchedGroups;

ScheduleDAGInstrs *DAG;

const SIInstrInfo *TII;		const SIInstrInfo *TII;

// Try to add an edge from SU A to SU B.		// Try to add an edge from SU A to SU B.
bool tryAddEdge(SUnit *A, SUnit *B);		bool tryAddEdge(SUnit *A, SUnit *B);

// Use SGMask to determine whether we can classify MI as a member of this		// Use SGMask to determine whether we can classify MI as a member of this
// SchedGroup object.		// SchedGroup object.
bool canAddMI(const MachineInstr &MI) const;		bool canAddMI(const MachineInstr &MI) const;

public:		public:
// Collection of SUnits that are classified as members of this group.		// Collection of SUnits that are classified as members of this group.
SmallVector<SUnit *, 32> Collection;		SmallVector<SUnit *, 32> Collection;

		ScheduleDAGInstrs *DAG;

// Returns true if SU can be added to this SchedGroup.		// Returns true if SU can be added to this SchedGroup.
bool canAddSU(SUnit &SU) const;		bool canAddSU(SUnit &SU) const;

// Add DAG dependencies from all SUnits in this SchedGroup and this SU. If		// Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
// MakePred is true, SU will be a predecessor of the SUnits in this		// MakePred is true, SU will be a predecessor of the SUnits in this
// SchedGroup, otherwise SU will be a successor.		// SchedGroup, otherwise SU will be a successor.
void link(SUnit &SU, bool MakePred = false);		void link(SUnit &SU, bool MakePred = false);

Show All 9 Lines	public:

// Add DAG dependencies such that SUnits in this group shall be ordered		// Add DAG dependencies such that SUnits in this group shall be ordered
// before SUnits in OtherGroup.		// before SUnits in OtherGroup.
void link(SchedGroup &OtherGroup);		void link(SchedGroup &OtherGroup);

// Returns true if no more instructions may be added to this group.		// Returns true if no more instructions may be added to this group.
bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }		bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

		// Returns true if the SU matches all rules
		bool allowedByRules(const SUnit *SU,
		SmallVectorImpl<SchedGroup> &SyncPipe) const {
		if (!Rules.has_value())
		return true;
		for (auto &Rule : *Rules) {
		if (!Rule(SU, Collection, TII, SyncPipe, SGID)) {
		return false;
		}
		}
		return true;
		}

// Add SU to the SchedGroup.		// Add SU to the SchedGroup.
void add(SUnit &SU) {		void add(SUnit &SU) {
LLVM_DEBUG(dbgs() << "For SchedGroup with mask "		LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
<< format_hex((int)SGMask, 10, true) << " adding "		<< format_hex((int)SGMask, 10, true) << " adding "
<< *SU.getInstr());		<< *SU.getInstr());
Collection.push_back(&SU);		Collection.push_back(&SU);
}		}

Show All 15 Lines	public:

int getSyncID() { return SyncID; }		int getSyncID() { return SyncID; }

int getSGID() { return SGID; }		int getSGID() { return SGID; }

SchedGroupMask getMask() { return SGMask; }		SchedGroupMask getMask() { return SGMask; }

SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,		SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
		std::optional<SmallVector<InstructionRuleType, 4>> Rules,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)		ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
: SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {		: SGMask(SGMask), MaxSize(MaxSize), Rules(Rules), TII(TII), DAG(DAG) {
SGID = NumSchedGroups++;		SGID = NumSchedGroups++;
}		}

SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,		SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)		std::optional<SmallVector<InstructionRuleType, 4>> Rules,
: SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {		int SyncID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
		: SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), Rules(Rules),
		TII(TII), DAG(DAG) {
SGID = NumSchedGroups++;		SGID = NumSchedGroups++;
}		}
};		};

// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.		// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {		static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||		assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||		SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
▲ Show 20 Lines • Show All 409 Lines • ▼ Show 20 Lines	for (; I != E; ++I) {
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
if (SG.getSGID() == CandSGID)		if (SG.getSGID() == CandSGID)
Match = &SG;		Match = &SG;
}		}

if (Match->isFull())		if (Match->isFull())
continue;		continue;

		if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
		continue;

LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "		LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " << CandSGID		<< (int)Match->getMask() << "and ID " << CandSGID
<< "\n");		<< "\n");
Match->add(*CurrSU.first);		Match->add(*CurrSU.first);
AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");		LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
CurrCost += AddedCost;		CurrCost += AddedCost;
advancePosition();		advancePosition();
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines	for (; I != E; ++I) {

LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "		LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
<< (int)Match->getMask() << "\n");		<< (int)Match->getMask() << "\n");

if (Match->isFull()) {		if (Match->isFull()) {
LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");		LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
continue;		continue;
}		}
		if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
		LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
		continue;
		}
TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");		LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
if (TempCost < BestNodeCost || BestNodeCost == -1) {		if (TempCost < BestNodeCost || BestNodeCost == -1) {
BestGroup = Match;		BestGroup = Match;
BestNodeCost = TempCost;		BestNodeCost = TempCost;
BestGroupID = CandSGID;		BestGroupID = CandSGID;
}		}
removeEdges(AddedEdges);		removeEdges(AddedEdges);
▲ Show 20 Lines • Show All 118 Lines • ▼ Show 20 Lines	void MFMASmallGemmOpt::applyIGLPStrategy(
for (const MachineInstr &I : *DAG)		for (const MachineInstr &I : *DAG)
if (TII->isMFMAorWMMA(I))		if (TII->isMFMAorWMMA(I))
++MFMACount;		++MFMACount;

const unsigned PipelineSyncID = 0;		const unsigned PipelineSyncID = 0;
SchedGroup *SG = nullptr;		SchedGroup *SG = nullptr;
for (unsigned I = 0; I < MFMACount * 3; ++I) {		for (unsigned I = 0; I < MFMACount * 3; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);		SchedGroupMask::DS, 2, std::nullopt, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);		SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
}		}
}		}

class DemoOpt final : public IGLPStrategy {		class DemoOpt final : public IGLPStrategy {
private:		private:
public:		public:
void applyIGLPStrategy(		void applyIGLPStrategy(
Show All 14 Lines	void DemoOpt::applyIGLPStrategy(
// Count the number of MFMA instructions.		// Count the number of MFMA instructions.
unsigned MFMACount = 0;		unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)		for (const MachineInstr &I : *DAG)
if (TII->isMFMAorWMMA(I))		if (TII->isMFMAorWMMA(I))
++MFMACount;		++MFMACount;

const unsigned PipelineSyncID = 0;		const unsigned PipelineSyncID = 0;
SchedGroup *SG = nullptr;		SchedGroup *SG = nullptr;
for (unsigned I = 0; I < MFMACount * 3; ++I) {
		// The SU is a successor of SU in prev SchedGroup
		InstructionRuleType Rule1 =
		[](const SUnit *SU, ArrayRef<SUnit *> Collection, const SIInstrInfo *TII,
		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID) {
		auto MI = SU->getInstr();
		if (MI->getOpcode() == TargetOpcode::BUNDLE)
		return false;

		SchedGroup *OtherGroup = nullptr;
		for (auto &PipeSG : SyncPipe) {
		if (PipeSG.getSGID() == (int)SGID - 1) {
		OtherGroup = &PipeSG;
		}
		}

		if (!OtherGroup)
		return false;

		return (std::any_of(OtherGroup->Collection.begin(),
		OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
		return std::any_of(Elt->Succs.begin(),
		Elt->Succs.end(),
		[&SU](SDep &Succ) {
		return Succ.getSUnit() == SU;
		});
		}));
		};

		SmallVector<InstructionRuleType, 4> DemoRules;
		kerbowa (Unsubmitted, Done): Can you add an addrule function in SchedGroup so that we don't need to do this extra copying?
		kerbowa (Unsubmitted, Done): DemoRules is unused now.
		DemoRules.push_back(Rule1);

		// Each iteration of pipeline has 1 MFMA and 1 DS_W, where the DS_W is a
		// successor of the MFMA
		for (unsigned I = 0; I < MFMACount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);		SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);		SchedGroupMask::DS_WRITE, 1, DemoRules, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
}		}
}		}

static std::unique_ptr<IGLPStrategy>		static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,		createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
const SIInstrInfo *TII) {		const SIInstrInfo *TII) {
switch (ID) {		switch (ID) {
case MFMASmallGemmOptID:		case MFMASmallGemmOptID:
return std::make_unique<MFMASmallGemmOpt>(DAG, TII);		return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
case DemoOptID:		case DemoOptID:
return std::make_unique<MFMASmallGemmOpt>(DAG, TII);		return std::make_unique<DemoOpt>(DAG, TII);
}		}

llvm_unreachable("Unknown IGLPStrategyID");		llvm_unreachable("Unknown IGLPStrategyID");
}		}

class IGroupLPDAGMutation : public ScheduleDAGMutation {		class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:		private:
const SIInstrInfo *TII;		const SIInstrInfo *TII;
▲ Show 20 Lines • Show All 254 Lines • ▼ Show 20 Lines
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {		void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
MachineInstr &MI = *SchedBarrier.getInstr();		MachineInstr &MI = *SchedBarrier.getInstr();
assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);		assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
// Remove all existing edges from the SCHED_BARRIER that were added due to the		// Remove all existing edges from the SCHED_BARRIER that were added due to the
// instruction having side effects.		// instruction having side effects.
resetEdges(SchedBarrier, DAG);		resetEdges(SchedBarrier, DAG);
auto InvertedMask =		auto InvertedMask =
invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());		invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);		SchedGroup SG(InvertedMask, std::nullopt, std::nullopt, DAG, TII);
SG.initSchedGroup();		SG.initSchedGroup();
// Preserve original instruction ordering relative to the SCHED_BARRIER.		// Preserve original instruction ordering relative to the SCHED_BARRIER.
SG.link(		SG.link(
SchedBarrier,		SchedBarrier,
(function_ref<bool(const SUnit *A, const SUnit *B)>)[](		(function_ref<bool(const SUnit *A, const SUnit *B)>)[](
const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });		const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
}		}

Show All 38 Lines	void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
// to the instruction having side effects.		// to the instruction having side effects.
resetEdges(*RIter, DAG);		resetEdges(*RIter, DAG);
MachineInstr &SGB = *RIter->getInstr();		MachineInstr &SGB = *RIter->getInstr();
assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);		assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
int32_t SGMask = SGB.getOperand(0).getImm();		int32_t SGMask = SGB.getOperand(0).getImm();
int32_t Size = SGB.getOperand(1).getImm();		int32_t Size = SGB.getOperand(1).getImm();
int32_t SyncID = SGB.getOperand(2).getImm();		int32_t SyncID = SGB.getOperand(2).getImm();

auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,		auto &SG = SyncedSchedGroups[SyncID].emplace_back(
Size, SyncID, DAG, TII);		(SchedGroupMask)SGMask, Size, std::nullopt, SyncID, DAG, TII);

SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);		SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}		}

void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {		void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
IGLPStrategyID StrategyID =		IGLPStrategyID StrategyID =
(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();		(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
auto S = createIGLPStrategy(StrategyID, DAG, TII);		auto S = createIGLPStrategy(StrategyID, DAG, TII);
Show All 15 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

	Show First 20 Lines • Show All 147 Lines • ▼ Show 20 Lines
	}			}


	define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {			define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
	; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:			; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
	; GCN: ; %bb.0: ; %entry			; GCN: ; %bb.0: ; %entry
	; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24			; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
	; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0			; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
				; GCN-NEXT: v_mov_b32_e32 v2, 1.0
	; GCN-NEXT: v_mov_b32_e32 v3, 2.0			; GCN-NEXT: v_mov_b32_e32 v3, 2.0
	; GCN-NEXT: ; iglp_opt mask(0x00000001)			; GCN-NEXT: ; iglp_opt mask(0x00000001)
	; GCN-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: v_add_u32_e32 v1, s0, v0			; GCN-NEXT: v_add_u32_e32 v1, s0, v0
	; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1			; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112
	; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456			; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96
	; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440			; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
	; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424			; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
	; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408			; GCN-NEXT: ds_read_b128 a[0:3], v1
	; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344			; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
	; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360			; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
	; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376			; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
	; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392
	; GCN-NEXT: v_mov_b32_e32 v2, 1.0
	; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264
	; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248
	; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232
	; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216
	; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200
	; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184
	; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168
	; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152
	; GCN-NEXT: s_waitcnt lgkmcnt(8)
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
	; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
	; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
	; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592
	; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576
	; GCN-NEXT: v_add_u32_e32 v0, s1, v0
	; GCN-NEXT: s_waitcnt lgkmcnt(4)
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
	; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
	; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
	; GCN-NEXT: ds_read_b128 a[128:131], v1
	; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
	; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
	; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
	; GCN-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]			; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
	; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304			; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304
	; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288			; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288
	; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272			; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272
	; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256			; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256
	; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240			; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240
	; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224			; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224
	; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208			; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208
	; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192			; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192
	; GCN-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NEXT: v_add_u32_e32 v0, s1, v0
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
	; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688			; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688
	; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672			; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672
	; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656			; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656
	; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640			; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640
	; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624			; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624
	; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608			; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608
	; GCN-NEXT: s_nop 2			; GCN-NEXT: s_nop 3
	; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112			; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
	; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96			; GCN-NEXT: s_waitcnt lgkmcnt(7)
	; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80			; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
	; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64			; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592
	; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48			; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576
	; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32			; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
	; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16			; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
	; GCN-NEXT: ds_write_b128 v0, a[128:131]			; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
				; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
				; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
				; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
				; GCN-NEXT: ds_write_b128 v0, a[0:3]
	; GCN-NEXT: v_mov_b32_e32 v0, s1			; GCN-NEXT: v_mov_b32_e32 v0, s1
	; GCN-NEXT: s_waitcnt lgkmcnt(8)			; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]			; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248
	; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672			; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232
	; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688			; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216
	; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640			; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200
				; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184
				; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168
				; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152
				; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1
	; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288			; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288
				; GCN-NEXT: s_waitcnt lgkmcnt(14)
				; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
				; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
				; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
				; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
				; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
				; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
				; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
				; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
				; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
	; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304			; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304
	; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256			; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256
	; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272			; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272
	; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224			; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224
	; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240			; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240
	; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192			; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192
	; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208			; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208
	; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656			; GCN-NEXT: s_nop 3
	; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608
	; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624
	; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576
	; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592
	; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
	; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
	; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
	; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480			; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
				; GCN-NEXT: s_waitcnt lgkmcnt(14)
				; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
	; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496			; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
	; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448			; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
	; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464			; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464
	; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416			; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416
	; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432			; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
	; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384			; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
	; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400			; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
	; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848			; GCN-NEXT: s_nop 7
	; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800			; GCN-NEXT: s_nop 3
	; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816			; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672
	; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768			; GCN-NEXT: s_waitcnt lgkmcnt(14)
	; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784			; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
				; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688
				; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640
				; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656
				; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608
				; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624
				; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576
				; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592
				; GCN-NEXT: s_nop 7
				; GCN-NEXT: s_nop 3
				; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864
				; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880
				; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832
				; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848
				; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800
				; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816
				; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768
				; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784
	; GCN-NEXT: s_endpgm			; GCN-NEXT: s_endpgm
	entry:			entry:
	call void @llvm.amdgcn.iglp.opt(i32 1)			call void @llvm.amdgcn.iglp.opt(i32 1)
	%idx = call i32 @llvm.amdgcn.workitem.id.x()			%idx = call i32 @llvm.amdgcn.workitem.id.x()
	%load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx			%load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
	%load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr			%load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
	%load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64			%load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
	%load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr			%load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr
	Show All 31 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][IGLP]: Add rules to SchedGroups
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 527850

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][IGLP]: Add rules to SchedGroups
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 527850

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

[AMDGPU][IGLP]: Add rules to SchedGroups
ClosedPublic