This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Fix halfword load merging for big-endian targets
ClosedPublic

Authored by olista01 on Nov 10 2015, 2:36 AM.

Download Raw Diff

Details

Reviewers

rengolin
junbuml
mcrosier

Summary

For big-endian targets, when we merge two halfword loads into a word load, the order of the halfwords in the loaded value is reversed compared to little-endian, so the load-store optimiser needs to swap the destination registers.

This does not affect merging of two word loads, as we use ldp, which treats the memory as two separate 32-bit words.

Diff Detail

Event Timeline

olista01 updated this revision to Diff 39798. · Nov 10 2015, 2:36 AM

olista01 retitled this revision to [AArch64] Fix halfword load merging for big-endian targets.

olista01 updated this object.

olista01 added reviewers: junbuml, mcrosier.

olista01 set the repository for this revision to rL LLVM.

olista01 added a subscriber: llvm-commits.

Herald added subscribers: rengolin, aemerson. · View Herald Transcript · Nov 10 2015, 2:36 AM

If the answer to my question is "yes", LGTM. If not, an assert somewhere would fix it. Thanks!

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
568	I'm guessing the code makes it Rt2MI and RtMI the only two possible options?

This revision is now accepted and ready to land. · Nov 10 2015, 2:57 AM

olista01 added inline comments. · Nov 10 2015, 3:05 AM

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
568	Yep, ExtDestMI and Rt2MI are both set to either I or Paired earlier in this function.

Committed revision 252597.

Thanks, Oliver!!!

junbuml mentioned this in D14183: [AArch64]Extend merging narrow loads into a wider load. · Nov 12 2015, 9:44 AM

Revision Contents

Path

Size

lib/

Target/

AArch64/

AArch64LoadStoreOptimizer.cpp

12 lines

test/

CodeGen/

AArch64/

arm64-ldr-merge.ll

37 lines

Diff 39798

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Show First 20 Lines • Show All 78 Lines • ▼ Show 20 Lines
struct AArch64LoadStoreOpt : public MachineFunctionPass {		struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;		static char ID;
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {		AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());		initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
}		}

const AArch64InstrInfo *TII;		const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;		const TargetRegisterInfo *TRI;
		const AArch64Subtarget *Subtarget;

// Scan the instructions looking for a load/store that can be combined		// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.		// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().		// Return the matching instruction if one is found, else MBB->end().
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,		MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,		LdStPairFlags &Flags,
unsigned Limit);		unsigned Limit);
// Merge the two instructions indicated into a single pair-wise instruction.		// Merge the two instructions indicated into a single pair-wise instruction.
▲ Show 20 Lines • Show All 437 Lines • ▼ Show 20 Lines	AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,

int OffsetImm = getLdStOffsetOp(RtMI).getImm();		int OffsetImm = getLdStOffsetOp(RtMI).getImm();

if (isSmallTypeLdMerge(Opc)) {		if (isSmallTypeLdMerge(Opc)) {
// Change the scaled offset from small to large type.		// Change the scaled offset from small to large type.
if (!IsUnscaled)		if (!IsUnscaled)
OffsetImm /= 2;		OffsetImm /= 2;
MachineInstr *RtNewDest = MergeForward ? I : Paired;		MachineInstr *RtNewDest = MergeForward ? I : Paired;
		// When merging small (< 32 bit) loads for big-endian targets, the order of
		// the component parts gets swapped.
		if (!Subtarget->isLittleEndian())
		std::swap(RtMI, Rt2MI);
// Construct the new load instruction.		// Construct the new load instruction.
// FIXME: currently we support only halfword unsigned load. We need to		// FIXME: currently we support only halfword unsigned load. We need to
// handle byte type, signed, and store instructions as well.		// handle byte type, signed, and store instructions as well.
MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;		MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),		NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
TII->get(NewOpc))		TII->get(NewOpc))
.addOperand(getLdStRegOp(RtNewDest))		.addOperand(getLdStRegOp(RtNewDest))
.addOperand(BaseRegOp)		.addOperand(BaseRegOp)
.addImm(OffsetImm);		.addImm(OffsetImm);

// Copy MachineMemOperands from the original loads.		// Copy MachineMemOperands from the original loads.
concatenateMemOperands(NewMemMI, I, Paired);		concatenateMemOperands(NewMemMI, I, Paired);

DEBUG(		DEBUG(
dbgs()		dbgs()
<< "Creating the new load and extract. Replacing instructions:\n ");		<< "Creating the new load and extract. Replacing instructions:\n ");
DEBUG(I->print(dbgs()));		DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");		DEBUG(dbgs() << " ");
DEBUG(Paired->print(dbgs()));		DEBUG(Paired->print(dbgs()));
DEBUG(dbgs() << " with instructions:\n ");		DEBUG(dbgs() << " with instructions:\n ");
DEBUG((NewMemMI)->print(dbgs()));		DEBUG((NewMemMI)->print(dbgs()));

MachineInstr *ExtDestMI = MergeForward ? Paired : I;		MachineInstr *ExtDestMI = MergeForward ? Paired : I;
if (ExtDestMI == Rt2MI) {		if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
		rengolin (Unsubmitted, Not Done): I'm guessing the code makes it Rt2MI and RtMI the only two possible options?
		olista01 (Author, Unsubmitted, Not Done): Yep, ExtDestMI and Rt2MI are both set to either I or Paired earlier in this function.
// Create the bitfield extract for high half.		// Create the bitfield extract for high half.
BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),		BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
TII->get(AArch64::UBFMWri))		TII->get(AArch64::UBFMWri))
.addOperand(getLdStRegOp(Rt2MI))		.addOperand(getLdStRegOp(Rt2MI))
.addReg(getLdStRegOp(RtNewDest).getReg())		.addReg(getLdStRegOp(RtNewDest).getReg())
.addImm(16)		.addImm(16)
.addImm(31);		.addImm(31);
// Create the bitfield extract for low half.		// Create the bitfield extract for low half.
▲ Show 20 Lines • Show All 811 Lines • ▼ Show 20 Lines	bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
// FIXME: The benefit from converting narrow loads into a wider load could be		// FIXME: The benefit from converting narrow loads into a wider load could be
// microarchitectural as it assumes that a single load with two bitfield		// microarchitectural as it assumes that a single load with two bitfield
// extracts is cheaper than two narrow loads. Currently, this conversion is		// extracts is cheaper than two narrow loads. Currently, this conversion is
// enabled only in cortex-a57 on which performance benefits were verified.		// enabled only in cortex-a57 on which performance benefits were verified.
return ProfitableArch & (!SubTarget->requiresStrictAlign());		return ProfitableArch & (!SubTarget->requiresStrictAlign());
}		}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {		bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());		Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
TRI = Fn.getSubtarget().getRegisterInfo();		TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
		TRI = Subtarget->getRegisterInfo();

bool Modified = false;		bool Modified = false;
bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);		bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
for (auto &MBB : Fn)		for (auto &MBB : Fn)
Modified |= optimizeBlock(MBB, enableNarrowLdOpt);		Modified |= optimizeBlock(MBB, enableNarrowLdOpt);

return Modified;		return Modified;
}		}
Show All 9 Lines

test/CodeGen/AArch64/arm64-ldr-merge.ll

	; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s			; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE
				; RUN: llc < %s -march=aarch64_be -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=BE

	; CHECK-LABEL: Ldrh_merge			; CHECK-LABEL: Ldrh_merge
	; CHECK-NOT: ldrh			; CHECK-NOT: ldrh
	; CHECK: ldr [[NEW_DEST:w[0-9]+]]			; CHECK: ldr [[NEW_DEST:w[0-9]+]]
	; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff			; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
	; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]			; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
				; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
				; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
	define i16 @Ldrh_merge(i16* nocapture readonly %p) {			define i16 @Ldrh_merge(i16* nocapture readonly %p) {
	%1 = load i16, i16* %p, align 2			%1 = load i16, i16* %p, align 2
	%arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1			%arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
	%2 = load i16, i16* %arrayidx2, align 2			%2 = load i16, i16* %arrayidx2, align 2
	%add = add nuw nsw i16 %1, %2			%add = sub nuw nsw i16 %1, %2
	ret i16 %add			ret i16 %add
	}			}

	; CHECK-LABEL: Ldurh_merge			; CHECK-LABEL: Ldurh_merge
	; CHECK-NOT: ldurh			; CHECK-NOT: ldurh
	; CHECK: ldur [[NEW_DEST:w[0-9]+]]			; CHECK: ldur [[NEW_DEST:w[0-9]+]]
	; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff			; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
	; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]			; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]]
				; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
				; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
	define i16 @Ldurh_merge(i16* nocapture readonly %p) {			define i16 @Ldurh_merge(i16* nocapture readonly %p) {
	entry:			entry:
	%arrayidx = getelementptr inbounds i16, i16* %p, i64 -2			%arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
	%0 = load i16, i16* %arrayidx			%0 = load i16, i16* %arrayidx
	%arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1			%arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
	%1 = load i16, i16* %arrayidx3			%1 = load i16, i16* %arrayidx3
	%add = add nuw nsw i16 %0, %1			%add = sub nuw nsw i16 %0, %1
	ret i16 %add			ret i16 %add
	}			}

	; CHECK-LABEL: Ldrh_4_merge			; CHECK-LABEL: Ldrh_4_merge
	; CHECK-NOT: ldrh			; CHECK-NOT: ldrh
	; CHECK: ldp [[NEW_DEST:w[0-9]+]]			; CHECK: ldp [[WORD1:w[0-9]+]], [[WORD2:w[0-9]+]], [x0]
				; CHECK-DAG: and [[WORD1LO:w[0-9]+]], [[WORD1]], #0xffff
				; CHECK-DAG: lsr [[WORD1HI:w[0-9]+]], [[WORD1]], #16
				; CHECK-DAG: and [[WORD2LO:w[0-9]+]], [[WORD2]], #0xffff
				; CHECK-DAG: lsr [[WORD2HI:w[0-9]+]], [[WORD2]], #16
				; LE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1HI]], [[WORD1LO]]
				; BE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1LO]], [[WORD1HI]]
				; LE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2LO]]
				; BE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2HI]]
				; LE: sub w0, [[TEMP2]], [[WORD2HI]]
				; BE: sub w0, [[TEMP2]], [[WORD2LO]]
	define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {			define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
	%arrayidx = getelementptr inbounds i16, i16* %P, i64 0			%arrayidx = getelementptr inbounds i16, i16* %P, i64 0
	%l0 = load i16, i16* %arrayidx			%l0 = load i16, i16* %arrayidx
	%arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1			%arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
	%l1 = load i16, i16* %arrayidx2			%l1 = load i16, i16* %arrayidx2
	%arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2			%arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
	%l2 = load i16, i16* %arrayidx7			%l2 = load i16, i16* %arrayidx7
	%arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3			%arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
	%l3 = load i16, i16* %arrayidx12			%l3 = load i16, i16* %arrayidx12
	%add4 = add nuw nsw i16 %l1, %l0			%add4 = sub nuw nsw i16 %l1, %l0
	%add9 = add nuw nsw i16 %add4, %l2			%add9 = udiv i16 %add4, %l2
	%add14 = add nuw nsw i16 %add9, %l3			%add14 = sub nuw nsw i16 %add9, %l3
	ret i16 %add14			ret i16 %add14
	}			}