This is an archive of the discontinued LLVM Phabricator instance.

128 is a really big number for LoopMicroOpBufferSize. You might want to consider modifying AArch64TTIImpl::getUnrollingPreferences with some more tailored heuristics.

Also, it would be nice to see the impact across a wider set of benchmarks, like the LLVM testsuite, so it's clear what impact more aggressive unrolling has in general.

In D40695#941589, @efriedma wrote:

128 is a really big number for LoopMicroOpBufferSize.

It appears to be the optimal number for T99, after having spent a lot of time moving it up and down in steps of 2 and testing the effects.

Values larger than 128 don't improve performance or increase unrolling -- at least on SPECcpu2017, libquantum and Google Protobufs, on T99. Values smaller than 128 hurt performance -- 128 is always better than anything smaller.

What would be nice is being able to set this parameter as a compile-time -mllvm -aarch64-loop-micro-ops-buffer-size option.

I have not tested other AArch64 micro-architectures, as I have no access to them.

You might want to consider modifying AArch64TTIImpl::getUnrollingPreferences with some more tailored heuristics.

That's something I would be happy to take a look at, but I am reluctant, for now, to make changes in AArch64TTIImpl::getUnrollingPreferences. That's a more involved change.

Also, it would be nice to see the impact across a wider set of benchmarks, like the LLVM testsuite, so it's clear what impact more aggressive unrolling has in general.

I make no claims that every single ISA or AArch64 micro-arch will benefit from increasing its LoopMicroOpBufferSize. This is a micro-arch specific change for T99.

Also, for this T99 specific change, the LLVM testsuite probably isn't the best benchmark. The specific type of loops that benefit most from this change are loops that contain a large number of nested conditionals. There are many loops of this type in SPECcpu2017 and in libquantum (quantum_toffoli). I'm not sure this type of deeply-nested loop is that widespread in the LLVM testsuite.

LGTM.

Larger than 128 doesn't improve performance or increase unrolling -- at least on SPECcpu2017, libquantum and Google Protobufs, on T99.

For benchmarking, just wanted to make sure you did some wider testing; I don't care about the LLVM testsuite specifically.

That's something I would be happy to take a look at, but I am reluctant, for now, to make changes in AArch64TTIImpl::getUnrollingPreferences. That's a more involved change.

Sure, that's fine.

This revision is now accepted and ready to land. · Nov 30 2017, 7:11 PM

fhahn added a subscriber: fhahn. · Dec 1 2017, 1:27 AM

LGTM, as long as you are happy with the speedup.

Closed by commit rL320272: [AArch64] Improve loop unrolling performance on Cavium T99 (authored by joel_k_jones). · Explain Why · Dec 9 2017, 4:00 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AArch64/

AArch64SchedThunderX2T99.td

2 lines

test/

CodeGen/

AArch64/

loop-micro-op-buffer-size-t99.ll

124 lines

Diff 126285

llvm/trunk/lib/Target/AArch64/AArch64SchedThunderX2T99.td

	Show All 16 Lines
	// 2. Pipeline Description.			// 2. Pipeline Description.

	def ThunderX2T99Model : SchedMachineModel {			def ThunderX2T99Model : SchedMachineModel {
	let IssueWidth = 4; // 4 micro-ops dispatched at a time.			let IssueWidth = 4; // 4 micro-ops dispatched at a time.
	let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.			let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
	let LoadLatency = 4; // Optimistic load latency.			let LoadLatency = 4; // Optimistic load latency.
	let MispredictPenalty = 12; // Extra cycles for mispredicted branch.			let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
	// Determined via a mix of micro-arch details and experimentation.			// Determined via a mix of micro-arch details and experimentation.
	let LoopMicroOpBufferSize = 32;			let LoopMicroOpBufferSize = 128;
	let PostRAScheduler = 1; // Using PostRA sched.			let PostRAScheduler = 1; // Using PostRA sched.
	let CompleteModel = 1;			let CompleteModel = 1;

	list<Predicate> UnsupportedFeatures = [HasSVE];			list<Predicate> UnsupportedFeatures = [HasSVE];
	}			}

	// Define the issue ports.			// Define the issue ports.

	▲ Show 20 Lines • Show All 1,847 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AArch64/loop-micro-op-buffer-size-t99.ll

				; REQUIRES: asserts
				; RUN: opt -mcpu=thunderx2t99 -loop-unroll --debug-only=loop-unroll -S -unroll-allow-partial < %s 2>&1 | FileCheck %s

				target triple = "aarch64-unknown-linux-gnu"

				; CHECK: Loop Unroll: F[foo] Loop %loop.2.header
				; CHECK: Loop Size = 19
				; CHECK: Trip Count = 512
				; CHECK: Trip Multiple = 512
				; CHECK: UNROLLING loop %loop.2.header by 4 with a breakout at trip 0
				; CHECK: Merging:
				; CHECK: Loop Unroll: F[foo] Loop %loop.header
				; CHECK: Loop Size = 18
				; CHECK: Trip Count = 512
				; CHECK: Trip Multiple = 512
				; CHECK: UNROLLING loop %loop.header by 4 with a breakout at trip 0
				; CHECK: Merging:
				; CHECK: %counter = phi i32 [ 0, %entry ], [ %inc.3, %loop.inc.3 ]
				; CHECK: %val = add nuw nsw i32 %counter, 5
				; CHECK: %val1 = add nuw nsw i32 %counter, 6
				; CHECK: %val2 = add nuw nsw i32 %counter, 7
				; CHECK: %val3 = add nuw nsw i32 %counter, 8
				; CHECK: %val4 = add nuw nsw i32 %counter, 9
				; CHECK: %val5 = add nuw nsw i32 %counter, 10
				; CHECK-NOT: %val = add i32 %counter, 5
				; CHECK-NOT: %val = add i32 %counter, 6
				; CHECK-NOT: %val = add i32 %counter, 7
				; CHECK-NOT: %val = add i32 %counter, 8
				; CHECK-NOT: %val = add i32 %counter, 9
				; CHECK-NOT: %val = add i32 %counter, 10
				; CHECK: %counter.2 = phi i32 [ 0, %exit.0 ], [ %inc.2.3, %loop.2.inc.3 ]

				; @foo contains two back-to-back counted loops (%loop.header and
				; %loop.2.header) that store values derived from the induction variable
				; into stack arrays. The directives at the top of this file expect both
				; loops to be partially unrolled by 4 for thunderx2t99.
				define void @foo(i32 * %out) {
				entry:
				; Eight 1024-element i32 stack arrays written by the loops below.
				%0 = alloca [1024 x i32]
				%x0 = alloca [1024 x i32]
				%x01 = alloca [1024 x i32]
				%x02 = alloca [1024 x i32]
				%x03 = alloca [1024 x i32]
				%x04 = alloca [1024 x i32]
				%x05 = alloca [1024 x i32]
				%x06 = alloca [1024 x i32]
				br label %loop.header

				; First loop: %counter starts at 0 and advances by 2 (see %loop.inc).
				loop.header:
				%counter = phi i32 [0, %entry], [%inc, %loop.inc]
				br label %loop.body

				; Body: stores %counter into %0 and %counter+5 .. %counter+10 into
				; %x0 .. %x05, all at index %counter.
				loop.body:
				%ptr = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter
				store i32 %counter, i32* %ptr
				%val = add i32 %counter, 5
				%xptr = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter
				store i32 %val, i32* %xptr
				%val1 = add i32 %counter, 6
				%xptr1 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter
				store i32 %val1, i32* %xptr1
				%val2 = add i32 %counter, 7
				%xptr2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter
				store i32 %val2, i32* %xptr2
				%val3 = add i32 %counter, 8
				%xptr3 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter
				store i32 %val3, i32* %xptr3
				%val4 = add i32 %counter, 9
				%xptr4 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter
				store i32 %val4, i32* %xptr4
				%val5 = add i32 %counter, 10
				%xptr5 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter
				store i32 %val5, i32* %xptr5
				br label %loop.inc

				; Latch: step by 2 and leave the loop once the incremented counter is
				; >= 1023 (signed compare).
				loop.inc:
				%inc = add i32 %counter, 2
				%1 = icmp sge i32 %inc, 1023
				br i1 %1, label %exit.0, label %loop.header

				; Between the loops: copy element 5 of %0 to *%out.
				exit.0:
				%2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 5
				%3 = load i32, i32* %2
				store i32 %3, i32 * %out
				br label %loop.2.header


				; Second loop: same induction scheme as the first, with one extra store
				; (into %x06) in the body.
				loop.2.header:
				%counter.2 = phi i32 [0, %exit.0], [%inc.2, %loop.2.inc]
				br label %loop.2.body

				loop.2.body:
				%ptr.2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter.2
				store i32 %counter.2, i32* %ptr.2
				%val.2 = add i32 %counter.2, 5
				%xptr.2 = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter.2
				store i32 %val.2, i32* %xptr.2
				%val1.2 = add i32 %counter.2, 6
				%xptr1.2 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter.2
				; NOTE(review): stores %val1 from the first loop, leaving %val1.2
				; unused in this function; possibly a copy-paste leftover, confirm
				; before changing since the expected output was generated from this form.
				store i32 %val1, i32* %xptr1.2
				%val2.2 = add i32 %counter.2, 7
				%xptr2.2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter.2
				; NOTE(review): likewise stores %val2 from the first loop; %val2.2 is unused.
				store i32 %val2, i32* %xptr2.2
				%val3.2 = add i32 %counter.2, 8
				%xptr3.2 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter.2
				store i32 %val3.2, i32* %xptr3.2
				%val4.2 = add i32 %counter.2, 9
				%xptr4.2 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter.2
				store i32 %val4.2, i32* %xptr4.2
				%val5.2 = add i32 %counter.2, 10
				%xptr5.2 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter.2
				store i32 %val5.2, i32* %xptr5.2
				; %val5.2 is stored a second time, into %x06.
				%xptr6.2 = getelementptr [1024 x i32], [1024 x i32]* %x06, i32 0, i32 %counter.2
				store i32 %val5.2, i32* %xptr6.2
				br label %loop.2.inc

				; Latch of the second loop: same step and exit condition as %loop.inc.
				loop.2.inc:
				%inc.2 = add i32 %counter.2, 2
				%4 = icmp sge i32 %inc.2, 1023
				br i1 %4, label %exit.2, label %loop.2.header

				; Final exit: loads element 6 of %0 into %x3, then writes to %out[1].
				exit.2:
				%x2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 6
				%x3 = load i32, i32* %x2
				%out2 = getelementptr i32, i32 * %out, i32 1
				; NOTE(review): stores %3 (loaded in %exit.0), not the %x3 just loaded,
				; so %x3 is unused; presumably unintended, confirm.
				store i32 %3, i32 * %out2
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

Improve loop unrolling performance on T99 (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 126285

llvm/trunk/lib/Target/AArch64/AArch64SchedThunderX2T99.td

llvm/trunk/test/CodeGen/AArch64/loop-micro-op-buffer-size-t99.ll

Improve loop unrolling performance on T99
ClosedPublic