This is an archive of the discontinued LLVM Phabricator instance.

Differential D30903

[AArch64] Use alias analysis in the load/store optimization pass.
ClosedPublic

Authored by mcrosier on Mar 13 2017, 11:00 AM.

Download Raw Diff

Details

Reviewers

rengolin
eli.friedman
gberry
MatzeB

Commits

rGa69dcb6b66cb: [AArch64] Use alias analysis in the load/store optimization pass.
rL298092: [AArch64] Use alias analysis in the load/store optimization pass.

Summary

This allows the optimization to rearrange loads and stores more aggressively to form more load/store pairs. Saw a number of fairly significant code size reductions. Performance testing in flight.

Chad

Diff Detail

Event Timeline

mcrosier created this revision.Mar 13 2017, 11:00 AM

Herald added a subscriber: aemerson. · View Herald TranscriptMar 13 2017, 11:00 AM

mcrosier mentioned this in D30839: [ARM] Use alias analysis in ARMPreAllocLoadStoreOpt..Mar 13 2017, 11:02 AM

I would be interested to see load/store pair stats differences and compile time impact of this change.

In D30903#699555, @gberry wrote:

I would be interested to see load/store pair stats differences and compile time impact of this change.

I'll work on getting the compile-time difference, Geoff.

Here are the static opcode diffs for SPEC2000 for those benchmarks with more than 5 static instructions removed:

> ./spec2000/eon/diffs/eon_base.arm_linux.diff <

Opcode static count diff summary:

   -30  ldr x [x #]
   -14  str x [x #]
    -4  str w [x #]
    -2  ldr d [x #]
    -2  ldr q [x #]
    -2  stp w w [x #]
    -2  str q [x #]
     1  ldp d d [x #]
     1  ldp q q [x #]
     1  stp q q [x #]
     7  ldp x x [x #]
     9  stp x x [x #]
    13  mov x x
-------------------------
    32  added (excluding nops)
    56  removed (excluding nops)
   -24  net (excluding nops)

> ./spec2000/twolf/diffs/twolf_base.arm_linux.diff <

Opcode static count diff summary:

    -8  str w [x #]
    -7  ldr w [x #]
    -3  ldrsw x [x #]
     1  sxtw x w
     1  ldpsw x x [x #]
     4  ldp w w [x #]
     4  stp w w [x #]
-------------------------
    10  added (excluding nops)
    18  removed (excluding nops)
    -8  net (excluding nops)

> ./spec2000/gcc/diffs/cc1_base.arm_linux.diff <

Opcode static count diff summary:

   -38  ldr x [x #]
   -31  str x [x #]
   -12  str w [x #]
    -8  ldr w [x #]
     4  ldp w w [x #]
     5  stp w w [x #]
    16  stp x x [x #]
    19  ldp x x [x #]
-------------------------
    44  added (excluding nops)
    89  removed (excluding nops)
   -45  net (excluding nops)

> ./spec2000/perlbmk/diffs/perlbmk_base.arm_linux.diff <

Opcode static count diff summary:

   -32  ldr x [x #]
   -16  str x [x #]
    -8  ldr w [x #]
    -2  str w [x #]
     1  stp w w [x #]
     4  ldp w w [x #]
     8  stp x x [x #]
    16  ldp x x [x #]
-------------------------
    29  added (excluding nops)
    58  removed (excluding nops)
   -29  net (excluding nops)

> ./spec2000/crafty/diffs/crafty_base.arm_linux.diff <

Opcode static count diff summary:

   -22  str x [x #]
    -8  str w [x #]
    -6  ldr x [x #]
     3  ldp x x [x #]
     4  stp w w [x #]
    11  stp x x [x #]
-------------------------
    18  added (excluding nops)
    36  removed (excluding nops)
   -18  net (excluding nops)

> ./spec2000/mesa/diffs/mesa_base.arm_linux.diff <

Opcode static count diff summary:

   -66  ldr s [x #]
   -36  str q [x #]
   -24  str s [x #]
   -14  ldr w [x #]
   -12  str q [x #] #
    -2  str w [x #]
    -2  ldr q [x #]
     1  ldp q q [x #]
     1  stp w w [x #]
     7  ldp w w [x #]
    12  stp s s [x #]
    12  add x x #
    24  stp q q [x #]
    33  ldp s s [x #]
-------------------------
    90  added (excluding nops)
   156  removed (excluding nops)
   -66  net (excluding nops)

> ./spec2000/vortex/diffs/vortex_base.arm_linux.diff <

Opcode static count diff summary:

  -168  str x [x #]
   -38  str w [x #]
   -16  ldr w [x #]
    -4  ldr x [x #]
     2  ldp x x [x #]
     8  ldp w w [x #]
    15  stp w w [x #]
    86  stp x x [x #]
-------------------------
   111  added (excluding nops)
   226  removed (excluding nops)
  -115  net (excluding nops)

Here are the static opcode diffs for SPEC2006 for those benchmarks with more than 5 static instructions removed:

> ./spec2006/h264ref/diffs/h264ref_base.arm_linux.diff <

Opcode static count diff summary:

   -95  str w [x #]
   -62  str x [x #]
   -24  ldr w [x #]
    -6  ldr x [x #]
    -2  ldr q [x #]
    -2  str q [x #]
    -1  scvtf s w
    -1  fdiv s s s
    -1  adrp x  
    -1  fcvtzs w s
    -1  add x x #
     1  ldp q q [x #]
     1  stp q q [x #]
     3  ldp x x [x #]
    12  ldp w w [x #]
    33  stp x x [x #]
    43  stp w w [x #]
-------------------------
    93  added (excluding nops)
   196  removed (excluding nops)
  -103  net (excluding nops)

> ./spec2006/povray/diffs/povray_base.arm_linux.diff <

Opcode static count diff summary:

   -65  str x [x #]
   -42  ldr x [x #]
    -8  ldr s [x #]
    -6  ldr d [x #]
    -6  str s [x #]
    -6  str w [x #]
    -5  ldr w [x #]
    -4  ldr q [x #]
    -4  str q [x #]
    -4  str d [x #]
    -1  ldrsw x [x #]
     1  sxtw x w
     2  stp q q [x #]
     2  ldp q q [x #]
     2  stp w w [x #]
     2  stp d d [x #]
     3  ldp d d [x #]
     3  stp s s [x #]
     3  ldp w w [x #]
     4  ldp s s [x #]
    21  ldp x x [x #]
    33  stp x x [x #]
-------------------------
    76  added (excluding nops)
   151  removed (excluding nops)
   -75  net (excluding nops)

> ./spec2006/gcc/diffs/gcc_base.arm_linux.diff <

Opcode static count diff summary:

   -62  str x [x #]
   -20  ldr x [x #]
   -14  str w [x #]
    -2  ldr w [x #]
     1  ldp w w [x #]
     7  stp w w [x #]
    10  ldp x x [x #]
    31  stp x x [x #]
-------------------------
    49  added (excluding nops)
    98  removed (excluding nops)
   -49  net (excluding nops)

> ./spec2006/perlbench/diffs/perlbench_base.arm_linux.diff <

Opcode static count diff summary:

   -50  ldr x [x #]
   -19  str x [x #]
   -10  ldr w [x #]
    -8  str w [x #]
     3  stp w w [x #]
     5  ldp w w [x #]
    10  stp x x [x #]
    25  ldp x x [x #]
-------------------------
    43  added (excluding nops)
    87  removed (excluding nops)
   -44  net (excluding nops)

> ./spec2006/dealII/diffs/dealII_base.arm_linux.diff <

Opcode static count diff summary:

  -362  str x [x #]
   -64  str w [x #]
   -42  ldr x [x #]
   -12  ldr q [x #]
    -8  str q [x #]
    -2  ldr w [x #]
    -2  str d [x #]
    -1  sub x x #
     1  stp x x [x #] #
     1  ldp w w [x #]
     1  stp d d [x #]
     4  stp q q [x #]
     6  ldp q q [x #]
    21  ldp x x [x #]
    32  stp w w [x #]
   180  stp x x [x #]
-------------------------
   246  added (excluding nops)
   493  removed (excluding nops)
  -247  net (excluding nops)

> ./spec2006/xalancbmk/diffs/Xalan_base.arm_linux.diff <

Opcode static count diff summary:

   -86  str x [x #]
   -23  ldr x [x #]
    -8  str q [x #]
    -4  ldr q [x #]
     1  mov x x
     2  ldp q q [x #]
     4  stp q q [x #]
    11  ldp x x [x #]
    43  stp x x [x #]
-------------------------
    61  added (excluding nops)
   121  removed (excluding nops)
   -60  net (excluding nops)

> ./spec2006/gobmk/diffs/gobmk_base.arm_linux.diff <

Opcode static count diff summary:

   -36  str w [x #]
    -4  ldr w [x #]
    -2  ldr q [x #]
     1  ldp q q [x #]
     2  ldp w w [x #]
    18  stp w w [x #]
-------------------------
    21  added (excluding nops)
    42  removed (excluding nops)
   -21  net (excluding nops)

Here are the relative stats for SPEC2000/SPEC2006 combined, using LLVM statistics:

Message                                                                                    Diff  %age
--------------------------------------------------------------------------------------  -------  -------
aarch64-ldst-opt - Number of load/store pair instructions generated                        +969  1.63%
aarch64-ldst-opt - Number of loads from stores promoted                                    +232  362.50%
aarch64-ldst-opt - Number of narrow zero stores promoted                                    +20  2.41%
aarch64-ldst-opt - Number of post-index updates folded                                       -3  -0.12%
asm-printer - Number of machine instrs printed                                             -974  -0.03%
assembler - Number of emitted object file bytes                                           -3616  -0.01%
assembler - Number of evaluated fixups                                                       +8  0.00%
assembler - Number of fragment layouts                                                       +0  0.00%
basicaa - Number of times a GEP is decomposed                                           +125690  0.28%
basicaa - Number of times the limit to decompose GEPs is reached                            +43  0.04%
bdce - Number of instructions removed (unused)                                               +0  0.00%
bdce - Number of instructions trivialized (dead bits)                                        +0  0.00%
bitcode-reader - Number of MDStrings loaded                                                  +0  0.00%
branch-relaxation - Number of conditional branches relaxed                                   +0  0.00%
branchfolding - Number of block tails merged                                                 -6  -0.01%
mccodeemitter - Number of MC fixups created.                                                 +8  0.00%
mccodeemitter - Number of MC instructions emitted.                                         -974  -0.03%
mcexpr - Number of MCExpr evaluations                                                       +16  0.00%
memory-builtins - Number of arguments with unsolved size and offset                         +32  0.04%
memory-builtins - Number of load instructions with unsolved size and offset                 +84  0.16%

Looks good to me. Could you do the sanity checking that we do the right thing in CodeGen and compute the AliasAnalysis information only once for all of CodeGen and not repeat it for different passes? (i.e. -debug-pass=Executions should only show them computed once for all CodeGen passes). Feel free to delegate this task to https://reviews.llvm.org/D30839 if you want ;-)

evandro added a subscriber: evandro.Mar 13 2017, 2:19 PM

In D30903#699715, @MatzeB wrote:

Looks good to me. Could you do the sanity checking that we do the right thing in CodeGen and compute the AliasAnalysis information only once for all of CodeGen and not repeat it for different passes? (i.e. -debug-pass=Executions should only show them computed once for all CodeGen passes). Feel free to delegate this task to https://reviews.llvm.org/D30839 if you want ;-)

Sure, Matthias. After this change the Function AA result is freed after the AArch64 load/store optimization pass, rather than the Machine LICM pass (i.e., the change only extends the lifetime of the AA info and doesn't require it to be recomputed).

Performance results for SPEC2000/2006 are neutral, so this is mostly just a code size reduction optimization.

In D30903#700707, @mcrosier wrote:

In D30903#699715, @MatzeB wrote:

Looks good to me. Could you do the sanity checking that we do the right thing in CodeGen and compute the AliasAnalysis information only once for all of CodeGen and not repeat it for different passes? (i.e. -debug-pass=Executions should only show them computed once for all CodeGen passes). Feel free to delegate this task to https://reviews.llvm.org/D30839 if you want ;-)

Sure, Matthias. After this change the Function AA result is freed after the AArch64 load/store optimization pass, rather than the Machine LICM pass (i.e., the change only extends the lifetime of the AA info and doesn't require it to be recomputed).

That's fine; I just wanted to make sure we do not compute it twice because something in codegen fails to preserve.

The code change itself is obvious. LGTM.

This revision is now accepted and ready to land.Mar 14 2017, 10:54 AM

Compile-time regression tests on the llvm-test-suite and SPEC200X resulted in a net 1.288% improvement in compile time. I suspect that's really just noise, but the main takeaway is that no regressions were identified. Will commit soon.

Closed by commit rL298092: [AArch64] Use alias analysis in the load/store optimization pass. (authored by mcrosier). · Explain WhyMar 17 2017, 7:32 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

AArch64/

AArch64LoadStoreOptimizer.cpp

21 lines

test/

CodeGen/

AArch64/

ldst-opt-aa.mir

30 lines

Diff 91591

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

	Show First 20 Lines • Show All 87 Lines • ▼ Show 20 Lines

	struct AArch64LoadStoreOpt : public MachineFunctionPass {			struct AArch64LoadStoreOpt : public MachineFunctionPass {
	static char ID;			static char ID;

	AArch64LoadStoreOpt() : MachineFunctionPass(ID) {			AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
	initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());			initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
	}			}

				AliasAnalysis *AA;
	const AArch64InstrInfo *TII;			const AArch64InstrInfo *TII;
	const TargetRegisterInfo *TRI;			const TargetRegisterInfo *TRI;
	const AArch64Subtarget *Subtarget;			const AArch64Subtarget *Subtarget;

	// Track which registers have been modified and used.			// Track which registers have been modified and used.
	BitVector ModifiedRegs, UsedRegs;			BitVector ModifiedRegs, UsedRegs;

				virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
				AU.addRequired<AAResultsWrapperPass>();
				MachineFunctionPass::getAnalysisUsage(AU);
				}

	// Scan the instructions looking for a load/store that can be combined			// Scan the instructions looking for a load/store that can be combined
	// with the current instruction into a load/store pair.			// with the current instruction into a load/store pair.
	// Return the matching instruction if one is found, else MBB->end().			// Return the matching instruction if one is found, else MBB->end().
	MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,			MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
	LdStPairFlags &Flags,			LdStPairFlags &Flags,
	unsigned Limit,			unsigned Limit,
	bool FindNarrowMerge);			bool FindNarrowMerge);

	▲ Show 20 Lines • Show All 820 Lines • ▼ Show 20 Lines
	// avoiding having to do a C-style cast from uint_64t to int when			// avoiding having to do a C-style cast from uint_64t to int when
	// using alignTo from include/llvm/Support/MathExtras.h.			// using alignTo from include/llvm/Support/MathExtras.h.
	// FIXME: Move this function to include/MathExtras.h?			// FIXME: Move this function to include/MathExtras.h?
	static int alignTo(int Num, int PowOf2) {			static int alignTo(int Num, int PowOf2) {
	return (Num + PowOf2 - 1) & ~(PowOf2 - 1);			return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
	}			}

	static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,			static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
	const AArch64InstrInfo *TII) {			AliasAnalysis *AA) {
	// One of the instructions must modify memory.			// One of the instructions must modify memory.
	if (!MIa.mayStore() && !MIb.mayStore())			if (!MIa.mayStore() && !MIb.mayStore())
	return false;			return false;

	// Both instructions must be memory operations.			// Both instructions must be memory operations.
	if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())			if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
	return false;			return false;

	return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);			return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
	}			}

	static bool mayAlias(MachineInstr &MIa,			static bool mayAlias(MachineInstr &MIa,
	SmallVectorImpl<MachineInstr *> &MemInsns,			SmallVectorImpl<MachineInstr *> &MemInsns,
	const AArch64InstrInfo *TII) {			AliasAnalysis *AA) {
	for (MachineInstr *MIb : MemInsns)			for (MachineInstr *MIb : MemInsns)
	if (mayAlias(MIa, *MIb, TII))			if (mayAlias(MIa, *MIb, AA))
	return true;			return true;

	return false;			return false;
	}			}

	bool AArch64LoadStoreOpt::findMatchingStore(			bool AArch64LoadStoreOpt::findMatchingStore(
	MachineBasicBlock::iterator I, unsigned Limit,			MachineBasicBlock::iterator I, unsigned Limit,
	MachineBasicBlock::iterator &StoreI) {			MachineBasicBlock::iterator &StoreI) {
	▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines		do {
	trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);			trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

	// Otherwise, if the base register is modified, we have no match, so			// Otherwise, if the base register is modified, we have no match, so
	// return early.			// return early.
	if (ModifiedRegs[BaseReg])			if (ModifiedRegs[BaseReg])
	return false;			return false;

	// If we encounter a store aliased with the load, return early.			// If we encounter a store aliased with the load, return early.
	if (MI.mayStore() && mayAlias(LoadMI, MI, TII))			if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
	return false;			return false;
	} while (MBBI != B && Count < Limit);			} while (MBBI != B && Count < Limit);
	return false;			return false;
	}			}

	// Returns true if FirstMI and MI are candidates for merging or pairing.			// Returns true if FirstMI and MI are candidates for merging or pairing.
	// Otherwise, returns false.			// Otherwise, returns false.
	static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,			static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
	▲ Show 20 Lines • Show All 153 Lines • ▼ Show 20 Lines		if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
	}			}

	// If the Rt of the second instruction was not modified or used between			// If the Rt of the second instruction was not modified or used between
	// the two instructions and none of the instructions between the second			// the two instructions and none of the instructions between the second
	// and first alias with the second, we can combine the second into the			// and first alias with the second, we can combine the second into the
	// first.			// first.
	if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&			if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
	!(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&			!(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
	!mayAlias(MI, MemInsns, TII)) {			!mayAlias(MI, MemInsns, AA)) {
	Flags.setMergeForward(false);			Flags.setMergeForward(false);
	return MBBI;			return MBBI;
	}			}

	// Likewise, if the Rt of the first instruction is not modified or used			// Likewise, if the Rt of the first instruction is not modified or used
	// between the two instructions and none of the instructions between the			// between the two instructions and none of the instructions between the
	// first and the second alias with the first, we can combine the first			// first and the second alias with the first, we can combine the first
	// into the second.			// into the second.
	if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&			if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
	!(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&			!(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
	!mayAlias(FirstMI, MemInsns, TII)) {			!mayAlias(FirstMI, MemInsns, AA)) {
	Flags.setMergeForward(true);			Flags.setMergeForward(true);
	return MBBI;			return MBBI;
	}			}
	// Unable to combine these instructions due to interference in between.			// Unable to combine these instructions due to interference in between.
	// Keep looking.			// Keep looking.
	}			}
	}			}

	▲ Show 20 Lines • Show All 526 Lines • ▼ Show 20 Lines

	bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {			bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
	if (skipFunction(*Fn.getFunction()))			if (skipFunction(*Fn.getFunction()))
	return false;			return false;

	Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());			Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
	TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());			TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
	TRI = Subtarget->getRegisterInfo();			TRI = Subtarget->getRegisterInfo();
				AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

	// Resize the modified and used register bitfield trackers. We do this once			// Resize the modified and used register bitfield trackers. We do this once
	// per function and then clear the bitfield each time we optimize a load or			// per function and then clear the bitfield each time we optimize a load or
	// store.			// store.
	ModifiedRegs.resize(TRI->getNumRegs());			ModifiedRegs.resize(TRI->getNumRegs());
	UsedRegs.resize(TRI->getNumRegs());			UsedRegs.resize(TRI->getNumRegs());

	bool Modified = false;			bool Modified = false;
	Show All 22 Lines

test/CodeGen/AArch64/ldst-opt-aa.mir

This file was added.

				# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-ldst-opt %s -verify-machineinstrs -o - | FileCheck %s
				--- |
				define void @ldr_str_aa(i32* noalias nocapture %x, i32* noalias nocapture readonly %y) {
				entry:
				%0 = load i32, i32* %y, align 4
				store i32 %0, i32* %x, align 4
				%arrayidx2 = getelementptr inbounds i32, i32* %y, i32 1
				%1 = load i32, i32* %arrayidx2, align 4
				%arrayidx3 = getelementptr inbounds i32, i32* %x, i32 1
				store i32 %1, i32* %arrayidx3, align 4
				ret void
				}

				...
				---
				# CHECK-LABEL: name: ldr_str_aa
				# CHECK: %w8, %w9 = LDPWi %x1, 0
				# CHECK: STPWi %w8, %w9, %x0, 0
				name: ldr_str_aa
				tracksRegLiveness: true
				body: |
				bb.0.entry:
				liveins: %x0, %x1

				%w8 = LDRWui %x1, 0 :: (load 4 from %ir.y)
				STRWui killed %w8, %x0, 0 :: (store 4 into %ir.x)
				%w9 = LDRWui killed %x1, 1 :: (load 4 from %ir.arrayidx2)
				STRWui killed %w9, killed %x0, 1 :: (store 4 into %ir.arrayidx3)
				RET undef %lr