This is an archive of the discontinued LLVM Phabricator instance.

[AArch64]Enable the narrow ld promotion only on profitable microarchitectures
ClosedPublic

Authored by junbuml on Nov 2 2015, 9:12 AM.

Download Raw Diff

Details

Reviewers

mzolotukhin
ab
jmolloy
mcrosier

Summary

The benefit of converting narrow loads into a wider load (r251438) can be
microarchitecture-dependent, because it assumes that a single load with two bitfield
extracts is cheaper than two narrow loads. For now, this conversion is enabled
only on Cortex-A57, where the performance benefit was verified.

Diff Detail

Event Timeline

junbuml updated this revision to Diff 38936. · Nov 2 2015, 9:12 AM

junbuml retitled this revision to [AArch64]Enable the narrow ld promotion only on profitable microarchitectures.

junbuml updated this object.

junbuml added reviewers: jmolloy, mcrosier, ab, mzolotukhin.

junbuml added a subscriber: llvm-commits.

Herald added subscribers: rengolin, aemerson. · View Herald Transcript · Nov 2 2015, 9:12 AM

mcrosier added inline comments. · Nov 2 2015, 9:29 AM

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
1196	Why not call enableNarrowLdMerge(MBB->getParent()) here? You would need to change the logic in enableNarrowLdMerge since you're passing in a pointer to the MF, but that should be trivial.
1379	How about enableNarrowLdMerge, rather than couldNarrowLdMergeEnabled?
1395	I'd still prefer we sink this check into the optimizeBlock() function.

Thanks Chad for the review. I addressed your comments.

LGTM. Thanks, Jun.

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
1395	Jun and I discussed this offline. We shouldn't sink this because we don't need to call this predicate for every function. The answer will always be the same, so we only call it once in the runOnFunction function.

This revision is now accepted and ready to land. · Nov 2 2015, 11:11 AM

junbuml mentioned this in D14183: [AArch64]Extend merging narrow loads into a wider load. · Nov 3 2015, 1:20 PM

junbuml added a child revision: D14183: [AArch64]Extend merging narrow loads into a wider load.

committed in r252316.

Revision Contents

Path

Size

lib/

Target/

AArch64/

AArch64LoadStoreOptimizer.cpp

30 lines

test/

CodeGen/

AArch64/

arm64-ldp.ll

48 lines

arm64-ldr-merge.ll

47 lines

Diff 38944

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Show First 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	typedef struct LdStPairFlags {

void setSExtIdx(int V) { SExtIdx = V; }		void setSExtIdx(int V) { SExtIdx = V; }
int getSExtIdx() const { return SExtIdx; }		int getSExtIdx() const { return SExtIdx; }

} LdStPairFlags;		} LdStPairFlags;

struct AArch64LoadStoreOpt : public MachineFunctionPass {		struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;		static char ID;
AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {		AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());		initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
}		}

const AArch64InstrInfo *TII;		const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;		const TargetRegisterInfo *TRI;
bool IsStrictAlign;

// Scan the instructions looking for a load/store that can be combined		// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.		// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().		// Return the matching instruction if one is found, else MBB->end().
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,		MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,		LdStPairFlags &Flags,
unsigned Limit);		unsigned Limit);
// Merge the two instructions indicated into a single pair-wise instruction.		// Merge the two instructions indicated into a single pair-wise instruction.
Show All 26 Lines	struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Merge a pre- or post-index base register update into a ld/st instruction.		// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator		MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,		mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);		MachineBasicBlock::iterator Update, bool IsPreIdx);

// Find and merge foldable ldr/str instructions.		// Find and merge foldable ldr/str instructions.
bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);		bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

bool optimizeBlock(MachineBasicBlock &MBB);		// Check if converting two narrow loads into a single wider load with
		// bitfield extracts could be enabled.
		bool enableNarrowLdMerge(MachineFunction &Fn);

		bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

bool runOnMachineFunction(MachineFunction &Fn) override;		bool runOnMachineFunction(MachineFunction &Fn) override;

const char *getPassName() const override {		const char *getPassName() const override {
return AARCH64_LOAD_STORE_OPT_NAME;		return AARCH64_LOAD_STORE_OPT_NAME;
}		}
};		};
char AArch64LoadStoreOpt::ID = 0;		char AArch64LoadStoreOpt::ID = 0;
▲ Show 20 Lines • Show All 1,017 Lines • ▼ Show 20 Lines	if (Paired != E) {
// pain, so we let the merge routine tell us what the next instruction		// pain, so we let the merge routine tell us what the next instruction
// is after it's done mucking about.		// is after it's done mucking about.
MBBI = mergePairedInsns(MBBI, Paired, Flags);		MBBI = mergePairedInsns(MBBI, Paired, Flags);
return true;		return true;
}		}
return false;		return false;
}		}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {		bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
		bool enableNarrowLdOpt) {
bool Modified = false;		bool Modified = false;
// Three tranformations to do here:		// Three tranformations to do here:
// 1) Find halfword loads that can be merged into a single 32-bit word load		// 1) Find halfword loads that can be merged into a single 32-bit word load
// with bitfield extract instructions.		// with bitfield extract instructions.
// e.g.,		// e.g.,
// ldrh w0, [x2]		// ldrh w0, [x2]
// ldrh w1, [x2, #2]		// ldrh w1, [x2, #2]
// ; becomes		// ; becomes
Show All 11 Lines	bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// as a base-reg writeback.		// as a base-reg writeback.
// e.g.,		// e.g.,
// ldr x0, [x2]		// ldr x0, [x2]
// add x2, x2, #4		// add x2, x2, #4
// ; becomes		// ; becomes
// ldr x0, [x2], #4		// ldr x0, [x2], #4

for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();		for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
!IsStrictAlign && MBBI != E;) {		enableNarrowLdOpt && MBBI != E;) {
		mcrosierUnsubmitted Not Done Reply Inline Actions Why not call enableNarrowLdMerge(MBB->getParent()) here? You would need to change the logic in enableNarrowLdMerge since you're passing in a pointer to the MF, but that should be trivial. mcrosier: Why not call enableNarrowLdMerge(MBB->getParent()) here? You would need to change the logic in…
MachineInstr *MI = MBBI;		MachineInstr *MI = MBBI;
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
default:		default:
// Just move on to the next instruction.		// Just move on to the next instruction.
++MBBI;		++MBBI;
break;		break;
// Scaled instructions.		// Scaled instructions.
case AArch64::LDRHHui:		case AArch64::LDRHHui:
▲ Show 20 Lines • Show All 166 Lines • ▼ Show 20 Lines	for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
}		}
// FIXME: Do the other instructions.		// FIXME: Do the other instructions.
}		}
}		}

return Modified;		return Modified;
}		}

		bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
		mcrosierUnsubmitted Not Done Reply Inline Actions How about enableNarrowLdMerge, rather than couldNarrowLdMergeEnabled? mcrosier: How about enableNarrowLdMerge, rather than couldNarrowLdMergeEnabled?
		const AArch64Subtarget *SubTarget =
		&static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
		bool ProfitableArch = SubTarget->isCortexA57();
		// FIXME: The benefit from converting narrow loads into a wider loads could be
		// microarchitectural as it assumes that a single load with two bitfield
		// extracts is cheaper than two narrow loads. Currently, this conversion is
		// enabled only in cortex-a57 on which performance benefits were verified.
		return ProfitableArch & (!SubTarget->requiresStrictAlign());
		}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {		bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());		TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
TRI = Fn.getSubtarget().getRegisterInfo();		TRI = Fn.getSubtarget().getRegisterInfo();
IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
.requiresStrictAlign();

bool Modified = false;		bool Modified = false;
		bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
		mcrosierUnsubmitted Not Done Reply Inline Actions I'd still prefer we sink this check into the optimizeBlock() function. mcrosier: I'd still prefer we sink this check into the optimizeBlock() function.
		mcrosierUnsubmitted Not Done Reply Inline Actions Jun and I discussed this offline. We shouldn't sink this because we don't need to call this predicate for every function. The answer will always be the same, so we only call it once in the runOnFunction function. mcrosier: Jun and I discussed this offline. We shouldn't sink this because we don't need to call this…
for (auto &MBB : Fn)		for (auto &MBB : Fn)
Modified \|= optimizeBlock(MBB);		Modified \|= optimizeBlock(MBB, enableNarrowLdOpt);

return Modified;		return Modified;
}		}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep		// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?		// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the		/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.		/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {		FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
return new AArch64LoadStoreOpt();		return new AArch64LoadStoreOpt();
}		}

test/CodeGen/AArch64/arm64-ldp.ll

Show First 20 Lines • Show All 350 Lines • ▼ Show 20 Lines	define i64 @ldp_sext_int_post(i32* %p) nounwind {
%sexttmp = sext i32 %tmp to i64		%sexttmp = sext i32 %tmp to i64
%sexttmp1 = sext i32 %tmp1 to i64		%sexttmp1 = sext i32 %tmp1 to i64
%ptr = getelementptr inbounds i32, i32* %add.ptr, i64 1		%ptr = getelementptr inbounds i32, i32* %add.ptr, i64 1
call void @use-ptr(i32* %ptr)		call void @use-ptr(i32* %ptr)
%add = add nsw i64 %sexttmp1, %sexttmp		%add = add nsw i64 %sexttmp1, %sexttmp
ret i64 %add		ret i64 %add
}		}

; CHECK-LABEL: Ldrh_merge
; CHECK-NOT: ldrh
; CHECK: ldr [[NEW_DEST:w[0-9]+]]
; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]

define i16 @Ldrh_merge(i16* nocapture readonly %p) {
%1 = load i16, i16* %p, align 2
;%conv = zext i16 %0 to i32
%arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
%2 = load i16, i16* %arrayidx2, align 2
%add = add nuw nsw i16 %1, %2
ret i16 %add
}

; CHECK-LABEL: Ldurh_merge
; CHECK-NOT: ldurh
; CHECK: ldur [[NEW_DEST:w[0-9]+]]
; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
define i16 @Ldurh_merge(i16* nocapture readonly %p) {
entry:
%arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
%0 = load i16, i16* %arrayidx
%arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
%1 = load i16, i16* %arrayidx3
%add = add nuw nsw i16 %0, %1
ret i16 %add
}

; CHECK-LABEL: Ldrh_4_merge
; CHECK-NOT: ldrh
; CHECK: ldp [[NEW_DEST:w[0-9]+]]
define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
%arrayidx = getelementptr inbounds i16, i16* %P, i64 0
%l0 = load i16, i16* %arrayidx
%arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
%l1 = load i16, i16* %arrayidx2
%arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
%l2 = load i16, i16* %arrayidx7
%arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
%l3 = load i16, i16* %arrayidx12
%add4 = add nuw nsw i16 %l1, %l0
%add9 = add nuw nsw i16 %add4, %l2
%add14 = add nuw nsw i16 %add9, %l3

ret i16 %add14
}

test/CodeGen/AArch64/arm64-ldr-merge.ll

This file was added.

				; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 -verify-machineinstrs \| FileCheck %s

				; CHECK-LABEL: Ldrh_merge
				; CHECK-NOT: ldrh
				; CHECK: ldr [[NEW_DEST:w[0-9]+]]
				; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
				; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
				define i16 @Ldrh_merge(i16* nocapture readonly %p) {
				%1 = load i16, i16* %p, align 2
				%arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
				%2 = load i16, i16* %arrayidx2, align 2
				%add = add nuw nsw i16 %1, %2
				ret i16 %add
				}

				; CHECK-LABEL: Ldurh_merge
				; CHECK-NOT: ldurh
				; CHECK: ldur [[NEW_DEST:w[0-9]+]]
				; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
				; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
				define i16 @Ldurh_merge(i16* nocapture readonly %p) {
				entry:
				%arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
				%0 = load i16, i16* %arrayidx
				%arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
				%1 = load i16, i16* %arrayidx3
				%add = add nuw nsw i16 %0, %1
				ret i16 %add
				}

				; CHECK-LABEL: Ldrh_4_merge
				; CHECK-NOT: ldrh
				; CHECK: ldp [[NEW_DEST:w[0-9]+]]
				define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
				%arrayidx = getelementptr inbounds i16, i16* %P, i64 0
				%l0 = load i16, i16* %arrayidx
				%arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
				%l1 = load i16, i16* %arrayidx2
				%arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
				%l2 = load i16, i16* %arrayidx7
				%arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
				%l3 = load i16, i16* %arrayidx12
				%add4 = add nuw nsw i16 %l1, %l0
				%add9 = add nuw nsw i16 %add4, %l2
				%add14 = add nuw nsw i16 %add9, %l3
				ret i16 %add14
				}