This is an archive of the discontinued LLVM Phabricator instance.

[AArch64]Add support for converting halfword loads into a 32-bit word load
ClosedPublic

Authored by junbuml on Oct 15 2015, 7:43 AM.

Download Raw Diff

Details

Reviewers

mzolotukhin
ab
jmolloy
mcrosier

Summary

Convert two halfword loads into a single 32-bit word load with bitfield extract
instructions. For example :

ldrh w0, [x2]
ldrh w1, [x2, #2]

becomes

ldr w0, [x2]
ubfx w1, w0, #16, #16
and w0, w0, #0xffff

Diff Detail

Event Timeline

junbuml updated this revision to Diff 37481.Oct 15 2015, 7:43 AM

junbuml retitled this revision from to [AArch64]Add support for converting halfword loads into a 32-bit word load.

junbuml updated this object.

junbuml added reviewers: mcrosier, mzolotukhin.

junbuml added a subscriber: llvm-commits.

Herald added subscribers: rengolin, aemerson. · View Herald TranscriptOct 15 2015, 7:43 AM

junbuml added a reviewer: ab.Oct 15 2015, 7:46 AM

FYI,
With this change, a 1.5% performance improvement was observed in spec2006/h264ref.

Hi Jun,
Just a few general comments:

You might want to consider pulling this optimization into a separate loop before the loop that does the load and store pairing. I could see a case where you promote the ldrh+ldrh into a ldr and then that ldr gets paired with another ldr into a ldp.

ldrh w1, [x0]
ldrh w2, [x0, #2]
ldrh w3, [x0, #4]
ldrh w4, [x0, #6]

becomes something like:

ldp w1, w3, [x0]
ubfx w2, w1, #16, #16
ubfx w1, w1, #0, #16
ubfx w4, w3, #16, #16
ubfx w3, w3, #0, #16

For the suggestion in #1 to work you would need to append the MMOs to the new ldr instruction. Otherwise, the hasOrderedMemoryRef() check will fail.

Would it make sense to use an 'and w0, w0, #0xffff' to zero the upper bits of the load? In other words, I'm wondering if a 'ubfx' is more expensive than an 'and' operation?

Chad

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
172	Just delete and fall through.

mcrosier added a reviewer: jmolloy.Oct 15 2015, 9:27 AM

mcrosier added a subscriber: gberry.

I agree that implementing this in a separate loop will open up more pairing opportunities.
Okay, I will append the MMOs.
Yes, for the low 16 bits, AND should be preferred.

Thanks Chad for the review.

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
172	Thanks. I will fix it.

mcrosier added inline comments.Oct 15 2015, 10:28 AM

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
1181	I would also make this the first listed optimization. That way the documentation is consistent with the order in which the optimizations occur: (1) combining of halfword/small types, (2) load/store pairing, (3) pre- and post-index.

junbuml updated this revision to Diff 37590.Oct 16 2015, 8:26 AM

junbuml updated this object.

junbuml marked 3 inline comments as done.

LGTM, with one minor nit.

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
1142	If we merge two ldrh instructions into a ldr we haven't actually created a paired instruction. How about we add another statistic to count when we merge narrow loads/stores into wider loads/stores?

This revision is now accepted and ready to land.Oct 19 2015, 7:27 AM

junbuml added inline comments.Oct 19 2015, 7:32 AM

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
1142	Adding new statistic sounds good to me.

Committed in r250719

I tested it on A57 for spec2000 and spec2006. This patch was applied pretty widely, but a clear performance improvement was only observed in spec2006/h264ref (about 2%), and no performance regression was found. I can add a flag to turn it on only for a specific architecture. Please let me know if you have any suggestions.

James,
Unfortunately, we're not set up to conduct performance testing using the llvm test-suite. Currently, we can only perform correctness testing, but we hope to have this resolved in the coming weeks.

Chad

Revision Contents

Path

Size

lib/

Target/

AArch64/

AArch64LoadStoreOptimizer.cpp

252 lines

test/

CodeGen/

AArch64/

arm64-ldp.ll

49 lines

Diff 37590

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Show First 20 Lines • Show All 71 Lines • ▼ Show 20 Lines	typedef struct LdStPairFlags {

void setSExtIdx(int V) { SExtIdx = V; }		void setSExtIdx(int V) { SExtIdx = V; }
int getSExtIdx() const { return SExtIdx; }		int getSExtIdx() const { return SExtIdx; }

} LdStPairFlags;		} LdStPairFlags;

struct AArch64LoadStoreOpt : public MachineFunctionPass {		struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;		static char ID;
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {		AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());		initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
}		}

const AArch64InstrInfo *TII;		const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;		const TargetRegisterInfo *TRI;
		bool IsStrictAlign;

// Scan the instructions looking for a load/store that can be combined		// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.		// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().		// Return the matching instruction if one is found, else MBB->end().
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,		MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,		LdStPairFlags &Flags,
unsigned Limit);		unsigned Limit);
// Merge the two instructions indicated into a single pair-wise instruction.		// Merge the two instructions indicated into a single pair-wise instruction.
Show All 23 Lines	struct AArch64LoadStoreOpt : public MachineFunctionPass {
bool isMatchingUpdateInsn(MachineInstr MemMI, MachineInstr MI,		bool isMatchingUpdateInsn(MachineInstr MemMI, MachineInstr MI,
unsigned BaseReg, int Offset);		unsigned BaseReg, int Offset);

// Merge a pre- or post-index base register update into a ld/st instruction.		// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator		MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,		mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);		MachineBasicBlock::iterator Update, bool IsPreIdx);

		// Find and merge foldable ldr/str instructions.
		bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

bool optimizeBlock(MachineBasicBlock &MBB);		bool optimizeBlock(MachineBasicBlock &MBB);

bool runOnMachineFunction(MachineFunction &Fn) override;		bool runOnMachineFunction(MachineFunction &Fn) override;

const char *getPassName() const override {		const char *getPassName() const override {
return AARCH64_LOAD_STORE_OPT_NAME;		return AARCH64_LOAD_STORE_OPT_NAME;
}		}
};		};
Show All 13 Lines	static bool isUnscaledLdSt(unsigned Opc) {
case AArch64::STURWi:		case AArch64::STURWi:
case AArch64::STURXi:		case AArch64::STURXi:
case AArch64::LDURSi:		case AArch64::LDURSi:
case AArch64::LDURDi:		case AArch64::LDURDi:
case AArch64::LDURQi:		case AArch64::LDURQi:
case AArch64::LDURWi:		case AArch64::LDURWi:
case AArch64::LDURXi:		case AArch64::LDURXi:
case AArch64::LDURSWi:		case AArch64::LDURSWi:
		case AArch64::LDURHHi:
return true;		return true;
}		}
}		}

static bool isUnscaledLdSt(MachineInstr *MI) {		static bool isUnscaledLdSt(MachineInstr *MI) {
return isUnscaledLdSt(MI->getOpcode());		return isUnscaledLdSt(MI->getOpcode());
}		}

		static bool isSmallTypeLdMerge(unsigned Opc) {
		switch (Opc) {
		default:
		return false;
		case AArch64::LDRHHui:
		case AArch64::LDURHHi:
		mcrosierUnsubmitted Done Reply Inline Actions Just delete and fall through. mcrosier: Just delete and fall through.
		junbumlAuthorUnsubmitted Done Reply Inline Actions Thanks. I will fix it. junbuml: Thanks. I will fix it.
		return true;
		// FIXME: Add other instructions (e.g, LDRBBui, LDURSHWi, LDRSHWui, etc.).
		}
		}

// Scaling factor for unscaled load or store.		// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr *MI) {		static int getMemScale(MachineInstr *MI) {
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
default:		default:
llvm_unreachable("Opcode has unknown scale!");		llvm_unreachable("Opcode has unknown scale!");
case AArch64::LDRBBui:		case AArch64::LDRBBui:
case AArch64::STRBBui:		case AArch64::STRBBui:
return 1;		return 1;
case AArch64::LDRHHui:		case AArch64::LDRHHui:
		case AArch64::LDURHHi:
case AArch64::STRHHui:		case AArch64::STRHHui:
return 2;		return 2;
case AArch64::LDRSui:		case AArch64::LDRSui:
case AArch64::LDURSi:		case AArch64::LDURSi:
case AArch64::LDRSWui:		case AArch64::LDRSWui:
case AArch64::LDURSWi:		case AArch64::LDURSWi:
case AArch64::LDRWui:		case AArch64::LDRWui:
case AArch64::LDURWi:		case AArch64::LDURWi:
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::LDRWui:		case AArch64::LDRWui:
case AArch64::LDURWi:		case AArch64::LDURWi:
case AArch64::LDRXui:		case AArch64::LDRXui:
case AArch64::LDURXi:		case AArch64::LDURXi:
case AArch64::STRSui:		case AArch64::STRSui:
case AArch64::STURSi:		case AArch64::STURSi:
case AArch64::LDRSui:		case AArch64::LDRSui:
case AArch64::LDURSi:		case AArch64::LDURSi:
		case AArch64::LDRHHui:
		case AArch64::LDURHHi:
return Opc;		return Opc;
case AArch64::LDRSWui:		case AArch64::LDRSWui:
return AArch64::LDRWui;		return AArch64::LDRWui;
case AArch64::LDURSWi:		case AArch64::LDURSWi:
return AArch64::LDURWi;		return AArch64::LDURWi;
}		}
}		}

Show All 29 Lines	static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::LDURWi:		case AArch64::LDURWi:
return AArch64::LDPWi;		return AArch64::LDPWi;
case AArch64::LDRXui:		case AArch64::LDRXui:
case AArch64::LDURXi:		case AArch64::LDURXi:
return AArch64::LDPXi;		return AArch64::LDPXi;
case AArch64::LDRSWui:		case AArch64::LDRSWui:
case AArch64::LDURSWi:		case AArch64::LDURSWi:
return AArch64::LDPSWi;		return AArch64::LDPSWi;
		case AArch64::LDRHHui:
		return AArch64::LDRWui;
		case AArch64::LDURHHi:
		return AArch64::LDURWi;
}		}
}		}

static unsigned getPreIndexedOpcode(unsigned Opc) {		static unsigned getPreIndexedOpcode(unsigned Opc) {
switch (Opc) {		switch (Opc) {
default:		default:
llvm_unreachable("Opcode has no pre-indexed equivalent!");		llvm_unreachable("Opcode has no pre-indexed equivalent!");
case AArch64::STRSui:		case AArch64::STRSui:
▲ Show 20 Lines • Show All 141 Lines • ▼ Show 20 Lines	static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
return MI->getOperand(Idx);		return MI->getOperand(Idx);
}		}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {		static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
unsigned Idx = isPairedLdSt(MI) ? 3 : 2;		unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
return MI->getOperand(Idx);		return MI->getOperand(Idx);
}		}

		// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
		static void concatenateMemOperands(MachineInstr MI, MachineInstr Op0,
		MachineInstr *Op1) {
		assert(MI->memoperands_empty() && "expected a new machineinstr");
		size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
		(Op1->memoperands_end() - Op1->memoperands_begin());

		MachineFunction *MF = MI->getParent()->getParent();
		MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
		MachineSDNode::mmo_iterator MemEnd =
		std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
		MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
		MI->setMemRefs(MemBegin, MemEnd);
		}

MachineBasicBlock::iterator		MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,		AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,		MachineBasicBlock::iterator Paired,
const LdStPairFlags &Flags) {		const LdStPairFlags &Flags) {
MachineBasicBlock::iterator NextI = I;		MachineBasicBlock::iterator NextI = I;
++NextI;		++NextI;
// If NextI is the second of the two instructions to be merged, we need		// If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way we merge will invalidate the iterator,		// to skip one further. Either way we merge will invalidate the iterator,
Show All 28 Lines	if (getLdStOffsetOp(I).getImm() ==
// I.e., we turn ldp I, Paired into ldp Paired, I.		// I.e., we turn ldp I, Paired into ldp Paired, I.
// Update the index accordingly.		// Update the index accordingly.
if (SExtIdx != -1)		if (SExtIdx != -1)
SExtIdx = (SExtIdx + 1) % 2;		SExtIdx = (SExtIdx + 1) % 2;
} else {		} else {
RtMI = I;		RtMI = I;
Rt2MI = Paired;		Rt2MI = Paired;
}		}

		int OffsetImm = getLdStOffsetOp(RtMI).getImm();;

		if (isSmallTypeLdMerge(Opc)) {
		// Change the scaled offset from small to large type.
		if (!IsUnscaled)
		OffsetImm /= 2;
		MachineInstr *RtNewDest = MergeForward ? I : Paired;
		// Construct the new load instruction.
		// FIXME: currently we support only halfword unsigned load. We need to
		// handle byte type, signed, and store instructions as well.
		MachineInstr NewMemMI, BitExtMI1, *BitExtMI2;
		NewMemMI = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
		.addOperand(getLdStRegOp(RtNewDest))
		.addOperand(BaseRegOp)
		.addImm(OffsetImm);

		// Copy MachineMemOperands from the original loads.
		concatenateMemOperands(NewMemMI, I, Paired);

		DEBUG(
		dbgs()
		<< "Creating the new load and extract. Replacing instructions:\n ");
		DEBUG(I->print(dbgs()));
		DEBUG(dbgs() << " ");
		DEBUG(Paired->print(dbgs()));
		DEBUG(dbgs() << " with instructions:\n ");
		DEBUG((NewMemMI)->print(dbgs()));

		MachineInstr *ExtDestMI = MergeForward ? Paired : I;
		if (ExtDestMI == Rt2MI) {
		// Create the bitfield extract for high half.
		BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
		TII->get(AArch64::UBFMWri))
		.addOperand(getLdStRegOp(Rt2MI))
		.addReg(getLdStRegOp(RtNewDest).getReg())
		.addImm(16)
		.addImm(31);
		// Create the bitfield extract for low half.
		BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
		TII->get(AArch64::ANDWri))
		.addOperand(getLdStRegOp(RtMI))
		.addReg(getLdStRegOp(RtNewDest).getReg())
		.addImm(15);
		} else {
		// Create the bitfield extract for low half.
		BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
		TII->get(AArch64::ANDWri))
		.addOperand(getLdStRegOp(RtMI))
		.addReg(getLdStRegOp(RtNewDest).getReg())
		.addImm(15);
		// Create the bitfield extract for high half.
		BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
		TII->get(AArch64::UBFMWri))
		.addOperand(getLdStRegOp(Rt2MI))
		.addReg(getLdStRegOp(RtNewDest).getReg())
		.addImm(16)
		.addImm(31);
		}
		DEBUG(dbgs() << " ");
		DEBUG((BitExtMI1)->print(dbgs()));
		DEBUG(dbgs() << " ");
		DEBUG((BitExtMI2)->print(dbgs()));
		DEBUG(dbgs() << "\n");

		// Erase the old instructions.
		I->eraseFromParent();
		Paired->eraseFromParent();
		return NextI;
		}

// Handle Unscaled		// Handle Unscaled
int OffsetImm = getLdStOffsetOp(RtMI).getImm();
if (IsUnscaled)		if (IsUnscaled)
OffsetImm /= OffsetStride;		OffsetImm /= OffsetStride;

// Construct the new instruction.		// Construct the new instruction.
MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,		MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
I->getDebugLoc(), TII->get(NewOpc))		I->getDebugLoc(), TII->get(NewOpc))
.addOperand(getLdStRegOp(RtMI))		.addOperand(getLdStRegOp(RtMI))
.addOperand(getLdStRegOp(Rt2MI))		.addOperand(getLdStRegOp(Rt2MI))
▲ Show 20 Lines • Show All 120 Lines • ▼ Show 20 Lines	static bool mayAlias(MachineInstr *MIa,

return false;		return false;
}		}

/// findMatchingInsn - Scan the instructions looking for a load/store that can		/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.		/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator		MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,		AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,		LdStPairFlags &Flags, unsigned Limit) {
unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();		MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;		MachineBasicBlock::iterator MBBI = I;
MachineInstr *FirstMI = I;		MachineInstr *FirstMI = I;
++MBBI;		++MBBI;

unsigned Opc = FirstMI->getOpcode();		unsigned Opc = FirstMI->getOpcode();
bool MayLoad = FirstMI->mayLoad();		bool MayLoad = FirstMI->mayLoad();
bool IsUnscaled = isUnscaledLdSt(FirstMI);		bool IsUnscaled = isUnscaledLdSt(FirstMI);
unsigned Reg = getLdStRegOp(FirstMI).getReg();		unsigned Reg = getLdStRegOp(FirstMI).getReg();
unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();		unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
int Offset = getLdStOffsetOp(FirstMI).getImm();		int Offset = getLdStOffsetOp(FirstMI).getImm();

// Early exit if the first instruction modifies the base register.		// Early exit if the first instruction modifies the base register.
// e.g., ldr x0, [x0]		// e.g., ldr x0, [x0]
if (FirstMI->modifiesRegister(BaseReg, TRI))		if (FirstMI->modifiesRegister(BaseReg, TRI))
return E;		return E;

// Early exit if the offset if not possible to match. (6 bits of positive		// Early exit if the offset if not possible to match. (6 bits of positive
// range, plus allow an extra one in case we find a later insn that matches		// range, plus allow an extra one in case we find a later insn that matches
// with Offset-1)		// with Offset-1)
int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;		int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))		if (!isSmallTypeLdMerge(Opc) &&
		!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
return E;		return E;

// Track which registers have been modified and used between the first insn		// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.		// (inclusive) and the second insn.
BitVector ModifiedRegs, UsedRegs;		BitVector ModifiedRegs, UsedRegs;
ModifiedRegs.resize(TRI->getNumRegs());		ModifiedRegs.resize(TRI->getNumRegs());
UsedRegs.resize(TRI->getNumRegs());		UsedRegs.resize(TRI->getNumRegs());

▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
// If this is a volatile load/store that otherwise matched, stop looking		// If this is a volatile load/store that otherwise matched, stop looking
// as something is going on that we don't have enough information to		// as something is going on that we don't have enough information to
// safely transform. Similarly, stop if we see a hint to avoid pairs.		// safely transform. Similarly, stop if we see a hint to avoid pairs.
if (MI->hasOrderedMemoryRef() \|\| TII->isLdStPairSuppressed(MI))		if (MI->hasOrderedMemoryRef() \|\| TII->isLdStPairSuppressed(MI))
return E;		return E;
// If the resultant immediate offset of merging these instructions		// If the resultant immediate offset of merging these instructions
// is out of range for a pairwise instruction, bail and keep looking.		// is out of range for a pairwise instruction, bail and keep looking.
bool MIIsUnscaled = isUnscaledLdSt(MI);		bool MIIsUnscaled = isUnscaledLdSt(MI);
if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {		bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
		if (!IsSmallTypeLd &&
		!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);		trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(MI);		MemInsns.push_back(MI);
continue;		continue;
}		}

		if (IsSmallTypeLd) {
		// If the alignment requirements of the larger type scaled load
		// instruction can't express the scaled offset of the smaller type
		// input, bail and keep looking.
		if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
		trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
		MemInsns.push_back(MI);
		continue;
		}
		} else {
// If the alignment requirements of the paired (scaled) instruction		// If the alignment requirements of the paired (scaled) instruction
// can't express the offset of the unscaled input, bail and keep		// can't express the offset of the unscaled input, bail and keep
// looking.		// looking.
if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {		if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);		trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(MI);		MemInsns.push_back(MI);
continue;		continue;
}		}
		}
// If the destination register of the loads is the same register, bail		// If the destination register of the loads is the same register, bail
// and keep looking. A load-pair instruction with both destination		// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.		// registers the same is UNPREDICTABLE and will result in an exception.
if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {		if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);		trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(MI);		MemInsns.push_back(MI);
continue;		continue;
}		}
▲ Show 20 Lines • Show All 263 Lines • ▼ Show 20 Lines	for (unsigned Count = 0; MBBI != B; --MBBI) {
// Otherwise, if the base register is used or modified, we have no match, so		// Otherwise, if the base register is used or modified, we have no match, so
// return early.		// return early.
if (ModifiedRegs[BaseReg] \|\| UsedRegs[BaseReg])		if (ModifiedRegs[BaseReg] \|\| UsedRegs[BaseReg])
return E;		return E;
}		}
return E;		return E;
}		}

		bool AArch64LoadStoreOpt::tryToMergeLdStInst(
		MachineBasicBlock::iterator &MBBI) {
		MachineInstr *MI = MBBI;
		MachineBasicBlock::iterator E = MI->getParent()->end();
		// If this is a volatile load/store, don't mess with it.
		if (MI->hasOrderedMemoryRef())
		return false;

		// Make sure this is a reg+imm (as opposed to an address reloc).
		if (!getLdStOffsetOp(MI).isImm())
		return false;

		// Check if this load/store has a hint to avoid pair formation.
		// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
		if (TII->isLdStPairSuppressed(MI))
		return false;

		// Look ahead up to ScanLimit instructions for a pairable instruction.
		LdStPairFlags Flags;
		MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
		if (Paired != E) {
		++NumPairCreated;
		mcrosierUnsubmitted Not Done Reply Inline Actions If we merge two ldrh instructions into a ldr we haven't actually created a paired instruction. How about we add another statistic to count when we merge narrow loads/stores into wider loads/stores? mcrosier: If we merge two ldrh instructions into a ldr we haven't actually created a paired instruction.
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Adding new statistic sounds good to me. junbuml: Adding new statistic sounds good to me.
		if (isUnscaledLdSt(MI))
		++NumUnscaledPairCreated;

		// Merge the loads into a pair. Keeping the iterator straight is a
		// pain, so we let the merge routine tell us what the next instruction
		// is after it's done mucking about.
		MBBI = mergePairedInsns(MBBI, Paired, Flags);
		return true;
		}
		return false;
		}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {		bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
bool Modified = false;		bool Modified = false;
// Two tranformations to do here:		// Three tranformations to do here:
// 1) Find loads and stores that can be merged into a single load or store		// 1) Find halfword loads that can be merged into a single 32-bit word load
		// with bitfield extract instructions.
		// e.g.,
		// ldrh w0, [x2]
		// ldrh w1, [x2, #2]
		// ; becomes
		// ldr w0, [x2]
		// ubfx w1, w0, #16, #16
		// and w0, w0, #ffff
		// 2) Find loads and stores that can be merged into a single load or store
// pair instruction.		// pair instruction.
// e.g.,		// e.g.,
// ldr x0, [x2]		// ldr x0, [x2]
// ldr x1, [x2, #8]		// ldr x1, [x2, #8]
// ; becomes		// ; becomes
// ldp x0, x1, [x2]		// ldp x0, x1, [x2]
// 2) Find base register updates that can be merged into the load or store		// 3) Find base register updates that can be merged into the load or store
// as a base-reg writeback.		// as a base-reg writeback.
// e.g.,		// e.g.,
// ldr x0, [x2]		// ldr x0, [x2]
// add x2, x2, #4		// add x2, x2, #4
// ; becomes		// ; becomes
// ldr x0, [x2], #4		// ldr x0, [x2], #4

		mcrosierUnsubmitted Done Reply Inline Actions I would also make this the first listed optimization. That way the documentation is consistent with the order in which the optimization occur. combining of halfword/small types opt load/store paring opt pre- and post-index opt mcrosier: I would also make this the first listed optimization. That way the documentation is consistent…
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();		for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
		!IsStrictAlign && MBBI != E;) {
		MachineInstr *MI = MBBI;
		switch (MI->getOpcode()) {
		default:
		// Just move on to the next instruction.
		++MBBI;
		break;
		// Scaled instructions.
		case AArch64::LDRHHui:
		// Unscaled instructions.
		case AArch64::LDURHHi: {
		if (tryToMergeLdStInst(MBBI)) {
		Modified = true;
		break;
		}
		++MBBI;
		break;
		}
		// FIXME: Do the other instructions.
		}
		}

		for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {		MBBI != E;) {
MachineInstr *MI = MBBI;		MachineInstr *MI = MBBI;
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
default:		default:
// Just move on to the next instruction.		// Just move on to the next instruction.
++MBBI;		++MBBI;
break;		break;
// Scaled instructions.		// Scaled instructions.
Show All 15 Lines	for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
case AArch64::STURWi:		case AArch64::STURWi:
case AArch64::STURXi:		case AArch64::STURXi:
case AArch64::LDURSi:		case AArch64::LDURSi:
case AArch64::LDURDi:		case AArch64::LDURDi:
case AArch64::LDURQi:		case AArch64::LDURQi:
case AArch64::LDURWi:		case AArch64::LDURWi:
case AArch64::LDURXi:		case AArch64::LDURXi:
case AArch64::LDURSWi: {		case AArch64::LDURSWi: {
// If this is a volatile load/store, don't mess with it.		if (tryToMergeLdStInst(MBBI)) {
if (MI->hasOrderedMemoryRef()) {
++MBBI;
break;
}
// Make sure this is a reg+imm (as opposed to an address reloc).
if (!getLdStOffsetOp(MI).isImm()) {
++MBBI;
break;
}
// Check if this load/store has a hint to avoid pair formation.
// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
if (TII->isLdStPairSuppressed(MI)) {
++MBBI;
break;
}
// Look ahead up to ScanLimit instructions for a pairable instruction.
LdStPairFlags Flags;
MachineBasicBlock::iterator Paired =
findMatchingInsn(MBBI, Flags, ScanLimit);
if (Paired != E) {
++NumPairCreated;
if (isUnscaledLdSt(MI))
++NumUnscaledPairCreated;

// Merge the loads into a pair. Keeping the iterator straight is a
// pain, so we let the merge routine tell us what the next instruction
// is after it's done mucking about.
MBBI = mergePairedInsns(MBBI, Paired, Flags);
Modified = true;		Modified = true;
break;		break;
}		}
++MBBI;		++MBBI;
break;		break;
}		}
// FIXME: Do the other instructions.		// FIXME: Do the other instructions.
}		}
▲ Show 20 Lines • Show All 115 Lines • ▼ Show 20 Lines	bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
}		}

return Modified;		return Modified;
}		}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {		bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());		TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
TRI = Fn.getSubtarget().getRegisterInfo();		TRI = Fn.getSubtarget().getRegisterInfo();
		IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
		.requiresStrictAlign();

bool Modified = false;		bool Modified = false;
for (auto &MBB : Fn)		for (auto &MBB : Fn)
Modified \|= optimizeBlock(MBB);		Modified \|= optimizeBlock(MBB);

return Modified;		return Modified;
}		}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep		// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?		// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the		/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.		/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {		FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
return new AArch64LoadStoreOpt();		return new AArch64LoadStoreOpt();
}		}

test/CodeGen/AArch64/arm64-ldp.ll

Show First 20 Lines • Show All 349 Lines • ▼ Show 20 Lines	define i64 @ldp_sext_int_post(i32* %p) nounwind {
%tmp1 = load i32, i32* %add.ptr, align 4		%tmp1 = load i32, i32* %add.ptr, align 4
%sexttmp = sext i32 %tmp to i64		%sexttmp = sext i32 %tmp to i64
%sexttmp1 = sext i32 %tmp1 to i64		%sexttmp1 = sext i32 %tmp1 to i64
%ptr = getelementptr inbounds i32, i32* %add.ptr, i64 1		%ptr = getelementptr inbounds i32, i32* %add.ptr, i64 1
call void @use-ptr(i32* %ptr)		call void @use-ptr(i32* %ptr)
%add = add nsw i64 %sexttmp1, %sexttmp		%add = add nsw i64 %sexttmp1, %sexttmp
ret i64 %add		ret i64 %add
}		}

		; CHECK-LABEL: Ldrh_merge
		; CHECK-NOT: ldrh
		; CHECK: ldr [[NEW_DEST:w[0-9]+]]
		; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
		; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]

		define i16 @Ldrh_merge(i16* nocapture readonly %p) {
		%1 = load i16, i16* %p, align 2
		;%conv = zext i16 %0 to i32
		%arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
		%2 = load i16, i16* %arrayidx2, align 2
		%add = add nuw nsw i16 %1, %2
		ret i16 %add
		}

		; CHECK-LABEL: Ldurh_merge
		; CHECK-NOT: ldurh
		; CHECK: ldur [[NEW_DEST:w[0-9]+]]
		; CHECK: and w{{[0-9]+}}, [[NEW_DEST]], #0xffff
		; CHECK: lsr w{{[0-9]+}}, [[NEW_DEST]]
		define i16 @Ldurh_merge(i16* nocapture readonly %p) {
		entry:
		%arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
		%0 = load i16, i16* %arrayidx
		%arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
		%1 = load i16, i16* %arrayidx3
		%add = add nuw nsw i16 %0, %1
		ret i16 %add
		}

		; CHECK-LABEL: Ldrh_4_merge
		; CHECK-NOT: ldrh
		; CHECK: ldp [[NEW_DEST:w[0-9]+]]
		define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
		%arrayidx = getelementptr inbounds i16, i16* %P, i64 0
		%l0 = load i16, i16* %arrayidx
		%arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
		%l1 = load i16, i16* %arrayidx2
		%arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
		%l2 = load i16, i16* %arrayidx7
		%arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
		%l3 = load i16, i16* %arrayidx12
		%add4 = add nuw nsw i16 %l1, %l0
		%add9 = add nuw nsw i16 %add4, %l2
		%add14 = add nuw nsw i16 %add9, %l3

		ret i16 %add14
		}