This is an archive of the discontinued LLVM Phabricator instance.

Differential D19885

[AArch64] Decouple zero store promotion from narrow ld merge. NFC.
ClosedPublic

Authored by junbuml on May 3 2016, 12:52 PM.

Download Raw Diff

Details

Reviewers

t.p.northover
jmolloy
mcrosier

Commits

rG33be4997edad: [AArch64] Decouple zero store promotion from narrow ld merge. NFC.
rL268744: [AArch64] Decouple zero store promotion from narrow ld merge. NFC.

Summary

This change refactors the load/store optimizer to decouple zero store promotion from the narrow load merge, and adds a flag (enable-narrow-ld-merge, defaulting to true) to control the narrow load merge optimization.

Diff Detail

Repository: rL LLVM

Event Timeline

junbuml updated this revision to Diff 56048. · May 3 2016, 12:52 PM

junbuml retitled this revision from (no title) to "[AArch64] Decouple zero store promotion from narrow ld merge. NFC."

junbuml updated this object.

junbuml added reviewers: mcrosier, jmolloy, t.p.northover.

junbuml added a subscriber: llvm-commits.

Herald added subscribers: mcrosier, rengolin, aemerson. · View Herald Transcript · May 3 2016, 12:52 PM

In our internal tests, we found performance regressions with the narrow load merge in some cases. Initially, this optimization was driven by the +3% performance gain in spec2006/h264ref, which has a load-intensive hot loop. However, the gain I was targeting in h264ref is now completely covered by the SLP vectorizer.

Because this optimization converts two loads into one load plus two shift instructions, it could potentially hurt performance if a loop is arithmetic-intensive.

Through this change I want to let other people run performance tests with and without the narrow load merge. If there is no objection, I would like to disable the narrow load merge by default in a separate patch.

LGTM.

This revision is now accepted and ready to land. · May 3 2016, 1:37 PM

Closed by commit rL268744: [AArch64] Decouple zero store promotion from narrow ld merge. NFC. (authored by junbuml). · Explain Why · May 6 2016, 8:14 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AArch64/

AArch64LoadStoreOptimizer.cpp

44 lines

Diff 56416

llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Show First 20 Lines • Show All 45 Lines • ▼ Show 20 Lines
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",		static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);		cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we form		// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.		// pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),		static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
cl::Hidden);		cl::Hidden);

		static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden,
		cl::init(true),
		cl::desc("Enable narrow load merge"));

namespace llvm {		namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);		void initializeAArch64LoadStoreOptPass(PassRegistry &);
}		}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"		#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {		namespace {

▲ Show 20 Lines • Show All 547 Lines • ▼ Show 20 Lines	int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
: getLdStOffsetOp(StoreInst).getImm() * StoreSize;		: getLdStOffsetOp(StoreInst).getImm() * StoreSize;
int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)		int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
? getLdStOffsetOp(LoadInst).getImm()		? getLdStOffsetOp(LoadInst).getImm()
: getLdStOffsetOp(LoadInst).getImm() * LoadSize;		: getLdStOffsetOp(LoadInst).getImm() * LoadSize;
return (UnscaledStOffset <= UnscaledLdOffset) &&		return (UnscaledStOffset <= UnscaledLdOffset) &&
(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));		(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}		}

static bool isPromotableZeroStoreOpcode(MachineInstr *MI) {		static bool isPromotableZeroStoreOpcode(unsigned Opc) {
unsigned Opc = MI->getOpcode();
return isNarrowStore(Opc) \|\| Opc == AArch64::STRWui \|\| Opc == AArch64::STURWi;		return isNarrowStore(Opc) \|\| Opc == AArch64::STRWui \|\| Opc == AArch64::STURWi;
}		}

		static bool isPromotableZeroStoreOpcode(MachineInstr *MI) {
		return isPromotableZeroStoreOpcode(MI->getOpcode());
		}

static bool isPromotableZeroStoreInst(MachineInstr *MI) {		static bool isPromotableZeroStoreInst(MachineInstr *MI) {
return (isPromotableZeroStoreOpcode(MI)) &&		return (isPromotableZeroStoreOpcode(MI)) &&
getLdStRegOp(MI).getReg() == AArch64::WZR;		getLdStRegOp(MI).getReg() == AArch64::WZR;
}		}

MachineBasicBlock::iterator		MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,		AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator MergeMI,		MachineBasicBlock::iterator MergeMI,
▲ Show 20 Lines • Show All 1,087 Lines • ▼ Show 20 Lines	bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// e.g.,		// e.g.,
// strh wzr, [x0]		// strh wzr, [x0]
// strh wzr, [x0, #2]		// strh wzr, [x0, #2]
// ; becomes		// ; becomes
// str wzr, [x0]		// str wzr, [x0]
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();		for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
enableNarrowLdOpt && MBBI != E;) {		enableNarrowLdOpt && MBBI != E;) {
MachineInstr *MI = MBBI;		MachineInstr *MI = MBBI;
switch (MI->getOpcode()) {		unsigned Opc = MI->getOpcode();
default:		if (isPromotableZeroStoreOpcode(Opc) \|\|
// Just move on to the next instruction.		(EnableNarrowLdMerge && isNarrowLoad(Opc))) {
++MBBI;
break;
// Scaled instructions.
case AArch64::LDRBBui:
case AArch64::LDRHHui:
case AArch64::LDRSBWui:
case AArch64::LDRSHWui:
case AArch64::STRBBui:
case AArch64::STRHHui:
case AArch64::STRWui:
// Unscaled instructions.
case AArch64::LDURBBi:
case AArch64::LDURHHi:
case AArch64::LDURSBWi:
case AArch64::LDURSHWi:
case AArch64::STURBBi:
case AArch64::STURHHi:
case AArch64::STURWi: {
if (tryToMergeLdStInst(MBBI)) {		if (tryToMergeLdStInst(MBBI)) {
Modified = true;		Modified = true;
break;		} else
}		++MBBI;
		} else
++MBBI;		++MBBI;
break;
}
}
}		}

// 3) Find loads and stores that can be merged into a single load or store		// 3) Find loads and stores that can be merged into a single load or store
// pair instruction.		// pair instruction.
// e.g.,		// e.g.,
// ldr x0, [x2]		// ldr x0, [x2]
// ldr x1, [x2, #8]		// ldr x1, [x2, #8]
// ; becomes		// ; becomes
// ldp x0, x1, [x2]		// ldp x0, x1, [x2]
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();		for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
▲ Show 20 Lines • Show All 208 Lines • Show Last 20 Lines