This is an archive of the discontinued LLVM Phabricator instance.

[Scheduling] Improve group algorithm for store cluster
ClosedPublic

Authored by steven.zhang on Jul 20 2020, 12:23 AM.

Download Raw Diff

Details

Reviewers

fhahn
evandro
arsenm
jsji
kbarton
rampitec

Group Reviewers

Restricted Project

Commits

rGa6e9f5264c85: [Scheduling] Improve group algorithm for store cluster

Summary

The scheduler tries to classify the MemOps into different groups and then clusters neighboring MemOps within each group. The current algorithm places MemOps that have the same ctrl (non-data) dependency into the same group. That works fine for loads but not well for stores, as a store might have two memory dependencies.

See this example: Store Addr and Store Addr+8 are a clusterable pair. They have memory (ctrl) dependencies on different loads. The current implementation puts these two stores into different groups and fails to cluster them.

Load X               Load Y
  ^                    ^
  |                    |
  |mem                 |mem
  |                    |
  +                    +
Store Addr           Store Addr+8
  ^                    ^
  +--------------------+
         cluster

This affects cases like the following:

void foo(long long *restrict a, long long *restrict b, long long *restrict c, int n) {
        for (int i =0; i<n;i++)
            a[i] += b[i]*c[i];
}

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

steven.zhang created this revision.Jul 20 2020, 12:23 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 20 2020, 12:23 AM

Herald added subscribers: • wuzish, javed.absar, hiraditya and 2 others. · View Herald Transcript

Harbormaster completed remote builds in B64873: Diff 279123.Jul 20 2020, 1:13 AM

steven.zhang edited the summary of this revision. (Show Details)Jul 20 2020, 2:34 AM

Other than that, it seems sensible.

llvm/lib/CodeGen/MachineScheduler.cpp
1663	It doesn't seem to me that the condition needs to be cached in a variable.

Address comments.

Harbormaster completed remote builds in B65347: Diff 280055.Jul 23 2020, 3:35 AM

evandro accepted this revision.Jul 24 2020, 11:48 AM

This revision is now accepted and ready to land.Jul 24 2020, 11:48 AM

Closed by commit rGa6e9f5264c85: [Scheduling] Improve group algorithm for store cluster (authored by steven.zhang). · Explain WhyJul 26 2020, 7:05 PM

This revision was automatically updated to reflect the committed changes.

steven.zhang mentioned this in D85517: [Scheduling] Implement a new way to cluster loads/stores.Aug 7 2020, 4:57 AM

foad added a subscriber: foad.Aug 7 2020, 7:10 AM

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

MachineScheduler.cpp

8 lines

test/

CodeGen/

AArch64/

aarch64-stp-cluster.ll

47 lines

Diff 280768

llvm/lib/CodeGen/MachineScheduler.cpp

Show First 20 Lines • Show All 1,647 Lines • ▼ Show 20 Lines	void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;		DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
for (SUnit &SU : DAG->SUnits) {		for (SUnit &SU : DAG->SUnits) {
if ((IsLoad && !SU.getInstr()->mayLoad()) \|\|		if ((IsLoad && !SU.getInstr()->mayLoad()) \|\|
(!IsLoad && !SU.getInstr()->mayStore()))		(!IsLoad && !SU.getInstr()->mayStore()))
continue;		continue;

unsigned ChainPredID = DAG->SUnits.size();		unsigned ChainPredID = DAG->SUnits.size();
for (const SDep &Pred : SU.Preds) {		for (const SDep &Pred : SU.Preds) {
if (Pred.isCtrl() && !Pred.isArtificial()) {		// We only want to cluster the mem ops that have the same ctrl(non-data)
		// pred so that they didn't have ctrl dependency for each other. But for
		// store instrs, we can still cluster them if the pred is load instr.
		if ((Pred.isCtrl() &&
		(IsLoad \|\|
		(Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
		!Pred.isArtificial()) {
ChainPredID = Pred.getSUnit()->NodeNum;		ChainPredID = Pred.getSUnit()->NodeNum;
		evandroUnsubmitted Not Done Reply Inline Actions It doesn't seem to me that the condition needs to be cached in a variable. evandro: It doesn't seem to me that the condition needs to be cached in a variable.
break;		break;
}		}
}		}
// Insert the SU to corresponding store chain.		// Insert the SU to corresponding store chain.
auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;		auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
Chain.push_back(&SU);		Chain.push_back(&SU);
}		}

▲ Show 20 Lines • Show All 2,148 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll

Show First 20 Lines • Show All 141 Lines • ▼ Show 20 Lines	entry:
store volatile i64 %v, i64* %arrayidx1		store volatile i64 %v, i64* %arrayidx1
%arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1		%arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
store volatile i64 %v, i64* %arrayidx2		store volatile i64 %v, i64* %arrayidx2
%arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4		%arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
store volatile i64 %v, i64* %arrayidx3		store volatile i64 %v, i64* %arrayidx3
ret i64 %v		ret i64 %v
}		}

		; CHECK: ******** MI Scheduling ********
		; CHECK-LABEL: stp_i64_with_ld:%bb.0
		; CHECK:Cluster ld/st SU(5) - SU(10)
		; CHECK:Cluster ld/st SU(15) - SU(20)
		; CHECK:SU(5): STRXui %7:gpr64, %0:gpr64common, 0 ::
		; CHECK:SU(10): STRXui %12:gpr64, %0:gpr64common, 1 ::
		; CHECK:SU(15): STRXui %17:gpr64, %0:gpr64common, 2 ::
		; CHECK:SU(20): STRXui %22:gpr64, %0:gpr64common, 3 ::
		define void @stp_i64_with_ld(i64* noalias nocapture %a, i64* noalias nocapture readnone %b, i64* noalias nocapture readnone %c) {
		entry:
		%arrayidx = getelementptr inbounds i64, i64* %a, i64 8
		%0 = load i64, i64* %arrayidx, align 8
		%arrayidx3 = getelementptr inbounds i64, i64* %a, i64 16
		%1 = load i64, i64* %arrayidx3, align 8
		%mul = mul nsw i64 %1, %0
		%2 = load i64, i64* %a, align 8
		%add6 = add nsw i64 %2, %mul
		store i64 %add6, i64* %a, align 8
		%arrayidx.1 = getelementptr inbounds i64, i64* %a, i64 9
		%3 = load i64, i64* %arrayidx.1, align 8
		%arrayidx3.1 = getelementptr inbounds i64, i64* %a, i64 17
		%4 = load i64, i64* %arrayidx3.1, align 8
		%mul.1 = mul nsw i64 %4, %3
		%arrayidx5.1 = getelementptr inbounds i64, i64* %a, i64 1
		%5 = load i64, i64* %arrayidx5.1, align 8
		%add6.1 = add nsw i64 %5, %mul.1
		store i64 %add6.1, i64* %arrayidx5.1, align 8
		%arrayidx.2 = getelementptr inbounds i64, i64* %a, i64 10
		%6 = load i64, i64* %arrayidx.2, align 8
		%arrayidx3.2 = getelementptr inbounds i64, i64* %a, i64 18
		%7 = load i64, i64* %arrayidx3.2, align 8
		%mul.2 = mul nsw i64 %7, %6
		%arrayidx5.2 = getelementptr inbounds i64, i64* %a, i64 2
		%8 = load i64, i64* %arrayidx5.2, align 8
		%add6.2 = add nsw i64 %8, %mul.2
		store i64 %add6.2, i64* %arrayidx5.2, align 8
		%arrayidx.3 = getelementptr inbounds i64, i64* %a, i64 11
		%9 = load i64, i64* %arrayidx.3, align 8
		%arrayidx3.3 = getelementptr inbounds i64, i64* %a, i64 19
		%10 = load i64, i64* %arrayidx3.3, align 8
		%mul.3 = mul nsw i64 %10, %9
		%arrayidx5.3 = getelementptr inbounds i64, i64* %a, i64 3
		%11 = load i64, i64* %arrayidx5.3, align 8
		%add6.3 = add nsw i64 %11, %mul.3
		store i64 %add6.3, i64* %arrayidx5.3, align 8
		ret void
		}