This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/
-
CodeGen/
2
MachineScheduler.cpp
-
Target/AArch64/
-
AArch64/
1/1
AArch64MacroFusion.cpp
1/1
AArch64TargetMachine.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
1/1
misched-fusion-aes.ll

Differential D33230

[AArch64] Make instruction fusion more aggressive.
ClosedPublic

Authored by fhahn on May 16 2017, 12:51 AM.

Download Raw Diff

Details

Reviewers

evandro
kristof.beyls
t.p.northover
silviu.baranga
atrick
rengolin
MatzeB

Commits

rGabb4218b988f: [AArch64] Make instruction fusion more aggressive.
rL303618: [AArch64] Make instruction fusion more aggressive.

Summary

This patch makes instruction fusion more aggressive by:

1. adding artificial edges between the successors of FirstSU and SecondSU, similar to BaseMemOpClusterMutation::clusterNeighboringMemOps.
2. updating PostGenericScheduler::tryCandidate to keep clusters together, similar to GenericScheduler::tryCandidate.

This change increases the number of AES instruction pairs generated on
Cortex-A57 and Cortex-A72. This doesn't change code at all in
most benchmarks or general code, but we've seen improvement on kernels
using AESE/AESMC and AESD/AESIMC.

Diff Detail

Event Timeline

fhahn created this revision. May 16 2017, 12:51 AM

Herald added subscribers: javed.absar, MatzeB, rengolin, aemerson. · View Herald Transcript · May 16 2017, 12:51 AM

fhahn added reviewers: rengolin, MatzeB. May 16 2017, 12:52 AM

javed.absar added inline comments. May 16 2017, 2:02 AM

lib/CodeGen/MachineScheduler.cpp
3236	This seems to change core PostRA-Scheduler logic and that may impact other targets. On the other hand, I see that GenericScheduler tries ClusteredNodes candidates and this was missing from here (perhaps for no good reason).

evandro added inline comments. May 16 2017, 8:35 AM

lib/Target/AArch64/AArch64MacroFusion.cpp
243	You can write this line as: `DEBUG(dbgs() << " Copy Succ "; SI->print(dbgs(), DAG); dbgs() << '\n';);`
lib/Target/AArch64/AArch64TargetMachine.cpp
283	Sorting these methods in alphabetical order would look better.
test/CodeGen/AArch64/misched-fusion-aes.ll
1	Is `CHECKA5A72` a typo?

Addressed feedback and rebased

fhahn added inline comments. May 17 2017, 2:45 AM

lib/CodeGen/MachineScheduler.cpp
3236	Yes, that change may impact other targets, but I think it makes sense to try to keep clustered instructions together during PostRA scheduling as well, as GenericScheduler does.

Can you please modify misched-fusion-aes.ll with this patch?

misched-fusion-aes.patch · 2 KB · Download

Thanks for the feedback, I've updated with @evandro's changes. Unfortunately I do not have access to a machine with an exynos-m1 CPU, so I can't benchmark the change on it, but it increases the number of AESE/AESMC pairs on that CPU in the test cases, so it should be positive.

Thank you.

This revision is now accepted and ready to land. May 22 2017, 8:58 AM

fhahn closed this revision. May 23 2017, 2:33 AM

Closed by commit rL303618: [AArch64] Make instruction fusion more aggressive. (authored by fhahn). · Explain Why · May 23 2017, 2:33 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

CodeGen/

MachineScheduler.cpp

6 lines

Target/

AArch64/

AArch64MacroFusion.cpp

12 lines

AArch64TargetMachine.cpp

2 lines

test/

CodeGen/

AArch64/

misched-fusion-aes.ll

137 lines

Diff 99110

lib/CodeGen/MachineScheduler.cpp

Show First 20 Lines • Show All 3,227 Lines • ▼ Show 20 Lines	if (!Cand.isValid()) {
return;		return;
}		}

// Prioritize instructions that read unbuffered resources by stall cycles.		// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Top.getLatencyStallCycles(TryCand.SU),		if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))		Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return;		return;

		// Keep clustered nodes together.
		javed.absarUnsubmitted Not Done Reply Inline Actions This seems to change core PostRA-Scheduler logic and that may impact other targets. On the other hand, I see that GenericScheduler tries ClusteredNodes candidates and this was missing from here (perhaps for no good reason). javed.absar: This seems to change core PostRA-Scheduler logic and that may impact other targets. On the…
		fhahnAuthorUnsubmitted Not Done Reply Inline Actions Yes that change may impact other targets, but I think it makes sense to try keep clustered instruction together during PostRA scheduling as well, as GenericScheduler does. fhahn: Yes that change may impact other targets, but I think it makes sense to try keep clustered…
		if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
		Cand.SU == DAG->getNextClusterSucc(),
		TryCand, Cand, Cluster))
		return;

// Avoid critical resource consumption and balance the schedule.		// Avoid critical resource consumption and balance the schedule.
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,		if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))		TryCand, Cand, ResourceReduce))
return;		return;
if (tryGreater(TryCand.ResDelta.DemandedResources,		if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources,		Cand.ResDelta.DemandedResources,
TryCand, Cand, ResourceDemand))		TryCand, Cand, ResourceDemand))
return;		return;
▲ Show 20 Lines • Show All 366 Lines • Show Last 20 Lines

lib/Target/AArch64/AArch64MacroFusion.cpp

Show First 20 Lines • Show All 226 Lines • ▼ Show 20 Lines	for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds)
IDep.setLatency(0);		IDep.setLatency(0);

DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";		DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
FirstSU.print(dbgs(), DAG); dbgs() << " - ";		FirstSU.print(dbgs(), DAG); dbgs() << " - ";
SecondSU.print(dbgs(), DAG); dbgs() << " / ";		SecondSU.print(dbgs(), DAG); dbgs() << " / ";
dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " <<		dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " <<
DAG->TII->getName(SecondMI->getOpcode()) << '\n'; );		DAG->TII->getName(SecondMI->getOpcode()) << '\n'; );

		if (&SecondSU != &DAG->ExitSU)
		// Make instructions dependent on FirstSU also dependent on SecondSU to
		// prevent them from being scheduled between FirstSU and SecondSU.
		for (SUnit::const_succ_iterator
		SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end();
		SI != SE; ++SI) {
		if (SI->getSUnit() == &SecondSU)
		continue;
		DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
		evandroUnsubmitted Done Reply Inline Actions You can write this line as: `DEBUG(dbgs() << " Copy Succ "; SI->print(dbgs(), DAG); dbgs() << '\n';);` evandro: You can write this line as: `DEBUG(dbgs() << " Copy Succ "; SI->print(dbgs(), DAG); dbgs() <<…
		DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial));
		}

++NumFused;		++NumFused;
return true;		return true;
}		}

return false;		return false;
}		}

/// \brief Post-process the DAG to create cluster edges between instrs that may		/// \brief Post-process the DAG to create cluster edges between instrs that may
Show All 30 Lines

lib/Target/AArch64/AArch64TargetMachine.cpp

Show First 20 Lines • Show All 274 Lines • ▼ Show 20 Lines	createMachineScheduler(MachineSchedContext *C) const override {
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));		DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAArch64MacroFusionDAGMutation());		DAG->addMutation(createAArch64MacroFusionDAGMutation());
return DAG;		return DAG;
}		}

ScheduleDAGInstrs *		ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {		createPostMachineScheduler(MachineSchedContext *C) const override {
const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();		const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
if (ST.hasFuseLiterals()) {		if (ST.hasFuseLiterals() \|\| ST.hasFuseAES()) {
		evandroUnsubmitted Done Reply Inline Actions Sorting these methods in alphabetical order would look better. evandro: Sorting these methods in alphabetical order would look better.
// Run the Macro Fusion after RA again since literals are expanded from		// Run the Macro Fusion after RA again since literals are expanded from
// pseudos then (v. addPreSched2()).		// pseudos then (v. addPreSched2()).
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);		ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
DAG->addMutation(createAArch64MacroFusionDAGMutation());		DAG->addMutation(createAArch64MacroFusionDAGMutation());
return DAG;		return DAG;
}		}

return nullptr;		return nullptr;
▲ Show 20 Lines • Show All 187 Lines • Show Last 20 Lines

test/CodeGen/AArch64/misched-fusion-aes.ll

; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57		; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA5A72
		evandroUnsubmitted Done Reply Inline Actions Is `CHECKA5A72` a typo? evandro: Is `CHECKA5A72` a typo?
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA72		; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1		; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 \| FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1

declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k)		declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k)
declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %d)		declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %d)
declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d, <16 x i8> %k)		declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d, <16 x i8> %k)
declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %d)		declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %d)

define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {		define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
store <16 x i8> %h1, <16 x i8>* %c1		store <16 x i8> %h1, <16 x i8>* %c1
%c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2		%c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2
store <16 x i8> %h2, <16 x i8>* %c2		store <16 x i8> %h2, <16 x i8>* %c2
%c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3		%c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3
store <16 x i8> %h3, <16 x i8>* %c3		store <16 x i8> %h3, <16 x i8>* %c3
ret void		ret void

; CHECK-LABEL: aesea:		; CHECK-LABEL: aesea:
; CHECKA57: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
; CHECKA57: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VC]]		; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
; CHECKA57: aesmc {{v[0-7].16b}}, [[VA]]		; CHECKA57A72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VD]]		; CHECKA57A72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57: aesmc {{v[0-7].16b}}, [[VB]]		; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
; CHECKA57: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VE]]		; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
; CHECKA57: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VF]]		; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
; CHECKA57: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]]		; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]]		; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
; CHECKA72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
; CHECKA72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
; CHECKA72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
; CHECKA72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
; CHECKA72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
; CHECKA72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
; CHECKA72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
; CHECKA72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]]		; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
		; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}}
; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VB]]		; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}}		; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}}
; CHECKM1: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VC]]		; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
; CHECKM1: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1: aesmc {{v[0-7].16b}}, [[VD]]		; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
; CHECKM1: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VE]]		; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
; CHECKM1: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VF]]		; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
; CHECKM1: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VG]]		; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
}		}

define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {		define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
%d0 = load <16 x i8>, <16 x i8>* %a0		%d0 = load <16 x i8>, <16 x i8>* %a0
%a1 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 1		%a1 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 1
%d1 = load <16 x i8>, <16 x i8>* %a1		%d1 = load <16 x i8>, <16 x i8>* %a1
%a2 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 2		%a2 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 2
%d2 = load <16 x i8>, <16 x i8>* %a2		%d2 = load <16 x i8>, <16 x i8>* %a2
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
store <16 x i8> %h1, <16 x i8>* %c1		store <16 x i8> %h1, <16 x i8>* %c1
%c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2		%c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2
store <16 x i8> %h2, <16 x i8>* %c2		store <16 x i8> %h2, <16 x i8>* %c2
%c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3		%c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3
store <16 x i8> %h3, <16 x i8>* %c3		store <16 x i8> %h3, <16 x i8>* %c3
ret void		ret void

; CHECK-LABEL: aesda:		; CHECK-LABEL: aesda:
; CHECKA57: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
; CHECKA57: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VC]]		; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
; CHECKA57: aesimc {{v[0-7].16b}}, [[VA]]		; CHECKA57A72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VD]]		; CHECKA57A72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57: aesimc {{v[0-7].16b}}, [[VB]]		; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
; CHECKA57: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VE]]		; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
; CHECKA57: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VF]]		; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
; CHECKA57: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VG]]		; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}		; CHECKA57A72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]]		; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
; CHECKA72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
; CHECKA72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
; CHECKA72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
; CHECKA72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
; CHECKA72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
; CHECKA72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
; CHECKA72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
; CHECKA72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]]		; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
		; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}}
; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VB]]		; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}}		; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}}
; CHECKM1: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VC]]		; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
; CHECKM1: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1: aesimc {{v[0-7].16b}}, [[VD]]		; CHECKM1: aesimc {{v[0-7].16b}}, [[VD]]
; CHECKM1: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VE]]		; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
; CHECKM1: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VF]]		; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
; CHECKM1: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VG]]		; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}		; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VH]]		}

		define void @aes_load_store(<16 x i8> %p1, <16 x i8> %p2 , <16 x i8> *%p3) {
		entry:
		%x1 = alloca <16 x i8>, align 16
		%x2 = alloca <16 x i8>, align 16
		%x3 = alloca <16 x i8>, align 16
		%x4 = alloca <16 x i8>, align 16
		%x5 = alloca <16 x i8>, align 16
		%in1 = load <16 x i8>, <16 x i8>* %p1, align 16
		store <16 x i8> %in1, <16 x i8>* %x1, align 16
		%aese1 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in1) #2
		store <16 x i8> %aese1, <16 x i8>* %x2, align 16
		%in2 = load <16 x i8>, <16 x i8>* %p2, align 16
		%aesmc1= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese1) #2
		store <16 x i8> %aesmc1, <16 x i8>* %x3, align 16
		%aese2 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in2) #2
		store <16 x i8> %aese2, <16 x i8>* %x4, align 16
		%aesmc2= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese2) #2
		store <16 x i8> %aesmc2, <16 x i8>* %x5, align 16
		ret void

		; CHECK-LABEL: aes_load_store:
		; CHECK: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
		; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
		; CHECK: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
		; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
}		}